-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5_Anonimization.py
410 lines (283 loc) · 13.8 KB
/
5_Anonimization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
import pandas as pd
import numpy as np
# Load the inspection records and keep an untouched copy; the copy is used
# later for the before/after plots and is saved alongside the result.
df = pd.read_pickle(r'data\final_fiscrep_deleted.pickle')
o_df = df.copy()

# Distinct vessel identifiers, captured before the column is renamed to 'CFR'.
matched_CFR = pd.DataFrame(df.matched_CFR.unique(), columns=['CFR'])

df.rename(columns={'matched_CFR': 'CFR', 'Vessel_Type_y': 'Vessel_Type'}, inplace=True)


def _year_or_nan(moment):
    """Return the year of *moment* as an int, or NaN when *moment* is null."""
    if pd.isnull(moment):
        return np.nan
    return int(moment.year)


# Only the year component of each date is disclosed.
for _date_column in ('Date of entry into service', 'Year of construction'):
    df[_date_column] = df[_date_column].apply(_year_or_nan)

# Further deletions after the k-identifier analysis.
# 'Place of registration' is not necessary; 'Hour' is dropped so that only
# 'Period' remains.  The indicator columns (IRCS, Licence, VMS, ERS, AIS),
# 'Subsidiary fishing gear 1'-'5', 'Segment' and 'Public aid' were noted by the
# original author as already deleted, so they are not listed here.
df = df.drop(columns=['Place of registration', 'Hour'])
# Placeholder seed: per the original author's note, the value shown here is not
# the one used for the real anonymisation run.
seed_r = 1
np.random.seed(seed_r)
# Noise scale factor: one draw from U(0.2, 0.4), multiplied by each column's
# standard deviation below.
alpha = np.random.uniform(0.2, 0.4)


def _non_null_population_std(column):
    """Return np.std (ddof=0) of the non-null entries of df[column]."""
    return np.std([value for value in df[column] if not pd.isnull(value)])


def _add_gaussian_noise(column, spread, decimals):
    """Replace each non-null entry of df[column] by a rounded normal draw.

    The draw is centred on the entry with standard deviation alpha * spread.
    The generator is reseeded with seed_r first.
    NOTE(review): reseeding before every column means every column consumes
    the same pseudo-random stream - confirm this is intended.
    """
    def _draw(value):
        if pd.isnull(value):
            return np.nan
        return np.round(np.random.normal(loc=value, scale=alpha * spread), decimals)

    np.random.seed(seed_r)
    df[column] = df[column].apply(_draw)


def _restore_positive(column):
    """Make df[column] positive again after noise addition.

    Zeros become the minimum of the original column (o_df); every other
    non-positive entry is replaced by its absolute value.
    """
    floor = o_df[column].min()

    def _fix(value):
        if value > 0:
            return value
        if value == 0:
            return floor
        return np.abs(value)

    df[column] = [_fix(value) for value in df[column]]


# Random noise on the two year columns, rounded to whole years.
_year_spreads = {
    name: _non_null_population_std(name)
    for name in ('Date of entry into service', 'Year of construction')
}
for _name, _spread in _year_spreads.items():
    _add_gaussian_noise(_name, _spread, 0)

# Noise on the two length columns, rounded to one decimal (pandas .std()).
_length_spreads = {name: df[name].std() for name in ('LOA', 'LBP')}
for _name, _spread in _length_spreads.items():
    _add_gaussian_noise(_name, _spread, 1)
for _name in _length_spreads:
    _restore_positive(_name)

# Rounding: auxiliary power to the nearest 10, other tonnage to an integer.
df['Power of auxiliary engine'] = (df['Power of auxiliary engine'] / 10).round() * 10
df['Other tonnage'] = df['Other tonnage'].round()

# Noise on main engine power and gross tonnage, rounded to one decimal.
for _name in ('Power of main engine', 'Tonnage GT'):
    _add_gaussian_noise(_name, df[_name].std(), 1)
    _restore_positive(_name)
# Diagnostic plots: original value (x axis) against anonymised value (y axis)
# for every perturbed column.
import matplotlib.pyplot as plt


def _scatter_original_vs_modified(column, title, xlabel, ylabel, color):
    """Show a scatter plot of o_df[column] (x) against df[column] (y).

    All six plots share this orientation so that the data always matches the
    "(Original)" / "(modified)" axis labels.
    """
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.scatter(o_df[column], df[column], s=10, c=color, alpha=0.5, edgecolors='none')
    plt.show()


# Difference in years between the original and the perturbed construction
# year.  The perturbed column holds float years, so the original dates are
# reduced to their year instead of parsing the floats with format='%Y'.
diff_year = pd.to_datetime(o_df['Year of construction']).dt.year - df['Year of construction']

_scatter_original_vs_modified(
    'Date of entry into service',
    'Scatter plot of Date of Entry into Service',
    'Date of entry into service (Original)',
    'Date of Entry into Service (modified)',
    'green',
)
_scatter_original_vs_modified(
    'Year of construction',
    'Scatter plot of Year of Construction',
    'Year of Construction (Original)',
    'Year of Construction (modified)',
    'green',
)
_scatter_original_vs_modified(
    'LOA',
    'Scatter plot of LOA',
    'LOA (Original)',
    'LOA (modified)',
    'blue',
)
_scatter_original_vs_modified(
    'LBP',
    'Scatter plot of LBP',
    'LBP (Original)',
    'LBP (modified)',
    'blue',
)
_scatter_original_vs_modified(
    'Power of main engine',
    'Scatter plot of Power of main engine',
    'Power of main engine (Original)',
    'Power of main engine (modified)',
    'red',
)
_scatter_original_vs_modified(
    'Tonnage GT',
    'Scatter plot of Tonnage GT',
    'Tonnage GT (Original)',
    'Tonnage GT (modified)',
    'red',
)
# Load the CFRs previously flagged by the fk / Fk = 1 analysis and alter one
# variable for each of them.


def _perturb_flagged_vessels(flagged_cfrs, target_columns):
    """Alter one column of df for each flagged CFR.

    flagged_cfrs and target_columns are paired by position: the i-th CFR has
    its target_columns[i] value changed.  A string value is blanked to NaN.
    Any other value is replaced by the rounded mean of the 2nd to 4th closest
    distinct values in that column (the closest one is skipped).

    Raises ValueError when there are fewer target columns than CFRs, before
    any row of df has been modified.
    """
    if len(target_columns) < len(flagged_cfrs):
        raise ValueError(
            f'{len(flagged_cfrs)} flagged CFRs but only '
            f'{len(target_columns)} target variables were supplied'
        )
    for cfr, column in zip(flagged_cfrs, target_columns):
        mask = df['CFR'] == cfr
        older_value = df.loc[mask, column].values[0]
        if isinstance(older_value, str):
            df.loc[mask, column] = np.nan
            continue
        distinct_values = df[column].drop_duplicates()
        closest_values = distinct_values.sort_values(
            key=lambda candidates: abs(candidates - older_value)
        ).iloc[1:4]
        new_value = closest_values.mean()
        df.loc[mask, column] = np.round(new_value, 0)
        print(cfr)
        print(column)
        print(new_value)
        print(older_value)


_suppression_rounds = [
    # Vector of variables for example. Real one disclosed for protection.
    (r'data\supress.pickle', ['Other tonnage']),
    # Vector of variables for example. Real one disclosed for protection.
    (r'data\supress2.pickle', ['Other tonnage']),
]
for _pickle_path, variables in _suppression_rounds:
    # Keep the CFRs in file order (first occurrence).  A set has no stable
    # iteration order, so pairing set elements with `variables` by position
    # would change from run to run.
    # NOTE(review): assumes `variables` is listed in the same order as the
    # CFRs appear in the pickle - confirm against how the vector was built.
    save_for_suppress = pd.read_pickle(_pickle_path)['CFR'].drop_duplicates().tolist()
    _perturb_flagged_vessels(save_for_suppress, variables)
# Pseudonymise the vessel identifiers: every real CFR becomes 'CFR_<n>' in
# order of appearance in matched_CFR, while identifiers that start with
# 'NOCFR' are kept unchanged.
CFR_code = []
number = 1
for original_id in matched_CFR['CFR']:
    if not original_id.startswith('NOCFR'):
        CFR_code.append('CFR_' + str(number))
        number += 1
        continue
    CFR_code.append(original_id)

# Keep the lookup table next to the original identifiers ...
matched_CFR['code'] = CFR_code
# ... and apply it to the working DataFrame.
cfr_to_code = dict(zip(matched_CFR['CFR'], matched_CFR['code']))
df['CFR'] = df['CFR'].map(cfr_to_code)
# Pseudonymise the 'Unit' column: each distinct value gets 'Unit_<n>', numbered
# from 1 in order of first appearance.
unit_unique = pd.DataFrame(df.Unit.unique(), columns=['Unit'])
unit_code = ['Unit_' + str(position) for position in range(1, len(unit_unique) + 1)]
unit_unique['code'] = unit_code

# Map every row through the lookup; rows without a match keep their value.
unit_lookup = unit_unique.set_index('Unit')['code']
df['Unit'] = df['Unit'].map(unit_lookup).fillna(df['Unit'])
import math
import random
from geopy import distance
import re
def dms_to_decimal(degrees, minutes, seconds):
    """Convert a degrees / minutes / seconds triple to decimal degrees."""
    minutes_as_degrees = minutes / 60
    seconds_as_degrees = seconds / 3600
    return degrees + minutes_as_degrees + seconds_as_degrees
def decimal_to_dms(decimal_degrees):
    """Format decimal degrees as 'Dº M´ S.ss' without spaces, e.g. '38º30´0.00'.

    The value is rounded to hundredths of a second before it is split, so the
    seconds field never prints as '60.00'; the carry goes into the minutes and
    degrees instead.  A negative value is written with one leading '-' and
    non-negative minutes and seconds.
    """
    # Work in integer hundredths of a second (1 degree = 360000) so that the
    # split below is exact and carries propagate naturally.
    total_centiseconds = int(round(abs(float(decimal_degrees)) * 360000))
    degrees, remainder = divmod(total_centiseconds, 360000)
    minutes, centiseconds = divmod(remainder, 6000)
    # No sign for values that round to zero.
    sign = '-' if decimal_degrees < 0 and total_centiseconds else ''
    return f"{sign}{degrees}º{minutes}´{centiseconds / 100:.2f}"


def decimal_to_dms_direction(decimal_degrees, direction):
    """Format decimal degrees like decimal_to_dms and append *direction*.

    *direction* is appended verbatim (the caller passes the last character of
    the original coordinate string).
    """
    return f"{decimal_to_dms(decimal_degrees)}{direction}"
# Upper bound, in miles, of the random shift applied to each axis.
displacement_miles = 0.5

# Shift every record's position by an independent uniform offset per axis.
for row_label, record in df.iterrows():
    latitude_str = record['Latitude']
    longitude_str = record['Longitude']

    # Pull the numeric fields (degrees, minutes, seconds) out of each string.
    lat_fields = re.findall(r'\d+\.\d+|\d+', latitude_str)
    lon_fields = re.findall(r'\d+\.\d+|\d+', longitude_str)

    # Unsigned decimal degrees; the hemisphere letter is re-attached below.
    latitude = dms_to_decimal(*(float(lat_fields[slot]) for slot in range(3)))
    longitude = dms_to_decimal(*(float(lon_fields[slot]) for slot in range(3)))

    # One uniform draw per axis, latitude first.
    lat_shift_miles = random.uniform(-displacement_miles, displacement_miles)
    lon_shift_miles = random.uniform(-displacement_miles, displacement_miles)

    # Convert the shifts from miles to degrees: one degree is treated as 69
    # miles, scaled by cos(latitude) for the longitude axis.
    miles_per_lon_degree = 69 * abs(math.cos(math.radians(latitude)))
    shifted_latitude = latitude + (lat_shift_miles / 69)
    shifted_longitude = longitude + (lon_shift_miles / miles_per_lon_degree)

    # Write the result back in DMS notation, reusing the original last
    # character of each string as the direction suffix.
    df.at[row_label, 'Latitude'] = decimal_to_dms_direction(shifted_latitude, latitude_str[-1])
    df.at[row_label, 'Longitude'] = decimal_to_dms_direction(shifted_longitude, longitude_str[-1])
import matplotlib.pyplot as plt
import geopandas as gpd
# Draw the same random 10 % of row positions from the original and the
# anonymised data so the two samples can be plotted against each other.
total_rows = len(o_df)
num_records = int(total_rows * 0.1)
random_indices = random.sample(range(total_rows), num_records)
o_df_sample, df_sample = (frame.iloc[random_indices] for frame in (o_df, df))
def convert_coordinates(coord):
    """Convert a DMS string ending in a direction letter to signed decimal degrees.

    The last character is the direction; the text before it is split on the
    degree, minute and second marks.  'S' and 'W' give a negative result.
    """
    hemisphere = coord[-1]
    fields = re.split('[º´″]', coord[:-1])
    whole_degrees = int(fields[0])
    whole_minutes = int(fields[1])
    secs = float(fields[2])
    magnitude = whole_degrees + (whole_minutes / 60) + (secs / 3600)
    return -magnitude if hemisphere in ('S', 'W') else magnitude
# Convert the sampled coordinate strings to signed decimal degrees.
original_latitudes = o_df_sample['Latitude'].apply(convert_coordinates)
original_longitudes = o_df_sample['Longitude'].apply(convert_coordinates)
displaced_latitudes = df_sample['Latitude'].apply(convert_coordinates)
displaced_longitudes = df_sample['Longitude'].apply(convert_coordinates)

# Read the shapefile.  Raw string: '\c' is not a valid escape sequence, so the
# backslash must not be left to the string-literal parser.
shapefile_path = r'shapefiles\concelhos.shp'
data = gpd.read_file(shapefile_path)

# Draw the shapefile and overlay both point sets on the Axes it returns,
# rather than relying on pyplot's implicit "current axes".
ax = data.plot()
ax.scatter(original_longitudes, original_latitudes, color='blue', label='Original', s=10, alpha=0.5)
ax.scatter(displaced_longitudes, displaced_latitudes, color='red', label='Displaced', s=10, alpha=0.5)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Original and Displaced Coordinates')
ax.legend()
ax.grid(True)
plt.show()
# Late preprocessing steps that were identified after the main clean-up.
df.drop(columns=['Vessel_Type_x', 'Infrac_a', 'local'], inplace=True)
# Assign the result back: calling replace(..., inplace=True) on df['Result']
# acts on a temporary Series and does not update df under Copy-on-Write.
df['Result'] = df['Result'].replace('TODOS', 'LEGAL')
df.rename(columns={'Art': 'Gear'}, inplace=True)

# Save the anonymised data and the untouched original, each as pickle and CSV.
df.to_pickle(r'data\final_fiscrep_anonimized.pickle')
df.to_csv(r'data\final_fiscrep_anonimized.csv')
o_df.to_pickle(r'data\final_fiscrep_original.pickle')
o_df.to_csv(r'data\final_fiscrep_original.csv')