-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataOrganiser.py
370 lines (326 loc) · 10.1 KB
/
DataOrganiser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
import sqlite3
import re
print('''Meteorite Data Explorer Copyright (C) 2018 DigiBrkr This program comes with ABSOLUTELY NO WARRANTY; \n \nThis is free software, and you are welcome to redistribute it under certain conditions. \n
See LICENSE.md for details. \n''')
loopCount = 0
#connect to raw.sqlite
rawConnection = sqlite3.connect('raw.sqlite')
rawCursor = rawConnection.cursor()
#make the new and faster database
indexDB = sqlite3.connect('index.sqlite')
indexCursor = indexDB.cursor()
#get rid of any exsiting tables
indexCursor.execute('DROP TABLE IF EXISTS Names')
indexCursor.execute('DROP TABLE IF EXISTS mID')
indexCursor.execute('DROP TABLE IF EXISTS Nametypes')
indexCursor.execute('DROP TABLE IF EXISTS Recclasses')
indexCursor.execute('DROP TABLE IF EXISTS Masses')
indexCursor.execute('DROP TABLE IF EXISTS Falls')
indexCursor.execute('DROP TABLE IF EXISTS Years')
indexCursor.execute('DROP TABLE IF EXISTS Reclats')
indexCursor.execute('DROP TABLE IF EXISTS Reclongs')
indexCursor.execute('DROP TABLE IF EXISTS Geolocations')
#make the new tables
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Names
(uID INTEGER PRIMARY KEY, Name TEXT )''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS mIDs
(uID INTEGER PRIMARY KEY, mID INTEGER )''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Nametypes
(uID INTEGER PRIMARY KEY, nametype TEXT )''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Recclasses
(uID INTEGER PRIMARY KEY, Recclass TEXT )''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Masses
(uID INTEGER PRIMARY KEY, Mass INT)''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Falls
(uID INTEGER PRIMARY KEY, Fall TEXT) ''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Years
(uID INTEGER PRIMARY KEY, year INTEGER) ''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Reclats
(uID INTEGER PRIMARY KEY, Reclat REAL) ''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Reclongs
(uID INTEGER PRIMARY KEY, Reclong REAL) ''')
indexCursor.execute('''CREATE TABLE IF NOT EXISTS Geolocations
(uID INTEGER PRIMARY KEY, Geolocation TEXT) ''')
#deal with names
rawCursor.execute('SELECT name FROM Meteorites')
names = rawCursor.fetchall()
print("Sanitizing Data. This may take a while depending on the speed of your computer. \n")
for name in names:
name = str(name)
#this code is hideous but it's the least ugly option to my knowledge
name = name.strip("(")
name = name.strip(")")
name = name.strip("'")
name = name.strip(",")
name = name.strip("'")
name = name.strip()
name = name.strip("'")
loopCount = loopCount + 1
indexCursor.execute('INSERT OR IGNORE INTO Names(Name) VALUES (?)', (name,))
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with mIDs
rawCursor.execute('SELECT id FROM Meteorites')
mIDs = rawCursor.fetchall()
for mID in mIDs:
#need to convert to a string to clean it up before we can go to an int
mID = str(mID)
mID = mID.strip("(")
mID = mID.strip(")")
mID = mID.strip(",")
#now we can convert to an int
mID = int(mID)
indexCursor.execute('INSERT OR IGNORE INTO mIDs(mID) VALUES (?)', (mID,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with Nametypes
rawCursor.execute('SELECT nametype FROM Meteorites')
nametypes = rawCursor.fetchall()
for nametype in nametypes:
#convert to string
nametype = str(nametype)
#de-junk
nametype = nametype.strip("(")
nametype = nametype.strip(")")
nametype = nametype.strip("'")
nametype = nametype.strip(",")
nametype = nametype.strip("'")
nametype = nametype.strip("'")
nametype = nametype.strip()
nametype = nametype.strip("'")
indexCursor.execute('INSERT OR IGNORE INTO Nametypes(nametype) VALUES (?)', (nametype,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with recclass
rawCursor.execute('SELECT recclass FROM Meteorites')
recclasses = rawCursor.fetchall()
for recclass in recclasses:
#convert to string
recclass = str(recclass)
#de-junk
recclass = recclass.strip("(")
recclass = recclass.strip(")")
recclass = recclass.strip("'")
recclass = recclass.strip("'")
recclass = recclass.strip(",")
recclass = recclass.strip("'")
recclass = recclass.strip()
recclass = recclass[1:]
recclass = recclass.strip("'")
indexCursor.execute('INSERT OR IGNORE INTO Recclasses(recclass) VALUES (?)', (recclass,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with mass
rawCursor.execute('SELECT mass FROM Meteorites')
masses = rawCursor.fetchall()
for mass in masses:
#we must convert to string and clean before we can go to an int
mass = str(mass)
mass = mass.strip("(")
mass = mass.strip(")")
mass = mass.strip("'")
mass = mass.strip("'")
mass = mass.strip(",")
mass = int(mass)
indexCursor.execute('INSERT OR IGNORE INTO Masses(mass) VALUES (?)', (mass,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with fall
rawCursor.execute('SELECT fall FROM Meteorites')
falls = rawCursor.fetchall()
for fall in falls:
#convert to string
fall = str(fall)
#de-junk
fall = fall.strip("(")
fall = fall.strip(")")
fall = fall.strip("'")
fall = fall.strip("'")
fall = fall.strip(",")
fall = fall.strip("'")
fall = fall.strip("'")
fall = fall.strip()
fall = fall.strip("'")
indexCursor.execute('INSERT OR IGNORE INTO Falls(Fall) VALUES (?)', (fall,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with year
rawCursor.execute('SELECT year FROM Meteorites')
years = rawCursor.fetchall()
for year in years:
#convert to string
year = str(year)
#de=junk
year = year.strip("(")
year = year.strip(")")
year = year.strip("'")
year = year.strip("'")
year = year.strip(",")
year = year.strip("'")
year = year.strip()
year = year.strip("'")
#year is in the floating timestamp format but it just has the year so we can throw away the rest
year = year.split("-")
year = year[0]
try:
#convert to int but be prepared for bad data
year = int(year)
except:
year = 0
indexCursor.execute('INSERT OR IGNORE INTO Years(year) VALUES (?)', (year,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with Reclats
rawCursor.execute('SELECT reclat FROM Meteorites')
reclats = rawCursor.fetchall()
for reclat in reclats:
#convert to string
reclat = str(reclat)
#de-junk
reclat = reclat.strip("(")
reclat = reclat.strip(")")
reclat = reclat.strip("'")
reclat = reclat.strip("'")
reclat = reclat.strip(",")
reclat = reclat.strip("'")
reclat = reclat.strip("u")
reclat = reclat.strip("'")
try:
#conver to float
reclat = float(reclat)
except:
reclat = 0
indexCursor.execute('INSERT OR IGNORE INTO Reclats(reclat) VALUES (?)', (reclat,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with reclongs
rawCursor.execute('SELECT reclong FROM Meteorites')
reclongs = rawCursor.fetchall()
for reclong in reclongs:
#convert to string
reclong = str(reclong)
#de-junk
reclong = reclong.strip("(")
reclong = reclong.strip(")")
reclong = reclong.strip("'")
reclong = reclong.strip("'")
reclong = reclong.strip(",")
reclong = reclong.strip("'")
reclong = reclong.strip("u")
reclong = reclong.strip("'")
try:
#Conver to float
reclong = float(reclong)
except:
reclong = 0
indexCursor.execute('INSERT OR IGNORE INTO Reclongs(reclong) VALUES (?)', (reclong,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
#deal with geolocations
rawCursor.execute('SELECT geolocation FROM Meteorites')
geolocations = rawCursor.fetchall()
for geolocation in geolocations:
#convert to string
geolocation = str(geolocation)
geolocation = geolocation.lstrip("{")
# type:Point,coordinates['")
# geolocation = geolocation.strip("[")
geolocation = geolocation.replace("{'type': 'Point', 'coordinates': ", "")
geolocation = geolocation.replace('(', '')
geolocation = geolocation.replace(')', '')
geolocation = geolocation.replace('[', '')
geolocation = geolocation.replace(']', '')
geolocation = geolocation.replace('}', '')
geolocation = geolocation.replace('"', '')
geolocation = geolocation[:-1]
# print(geolocation)
# geolocation = geolocation.strip("]}")
# geolocation = geolocation.split(",")
# try:
# geolocation = geolocation[1]
# except:
# geolocation = "none"
#geolocation = geolocation[:-9]
#geolocation = geolocation.strip()
#geolocation = geolocation.strip("[")
#set all bad data to "none"
if len(geolocation) <= 0:
geolocation = "none"
if geolocation == "0, 0":
geolocation = "none"
#print(geolocation)
indexCursor.execute('INSERT OR IGNORE INTO Geolocations(geolocation) VALUES (?)', (geolocation,))
loopCount = loopCount + 1
if loopCount == 10:
indexDB.commit()
loopCount = 0
continue
else:
continue
else:
loopCount = 0
indexDB.commit()
indexDB.close()
print("Data sanitisation complete. This Program will now exit.")