added bulk processing, simplified scripts/repo
ripnyt-ripnyt committed Feb 10, 2023
1 parent 4d1c08f commit 13b26e2
Showing 4 changed files with 260 additions and 53 deletions.
18 changes: 13 additions & 5 deletions README.md
@@ -2,11 +2,19 @@
Reconstruction of WA's Cannabis Data
---

CCRS-Python-MySQL is a few Python scripts that take the raw [CCRS](https://lcb.wa.gov/ccrs) .zip download from a public records request and convert it from a rather dirty dataset into a somewhat usable set of CSV files with up to 1,000,000 lines each.

I'm using this project to learn Python, MySQL, and methods for handling 150M+ lines of raw TSV data with challenging encoding and types. As I'm learning, the code has been refactored several times, but here's my best shot so far.
## How to use
---

There are two scripts: `inflate.py`, which inflates the zip download and removes the folder hierarchy, and `columnStandardization.py`, which standardizes the row width. Run `inflate.py` first, since `columnStandardization.py` reads its output.

Both read files from one directory and then write to a new directory without compression. The CCRS data in CSV form is 23 GB at this time, so be careful with disk space.

## Known Issues with the scripts and output data
---
No compression and slow; the scripts could possibly be changed to write compressed output (a sketch follows below). The data will have issues too: the size of the dataset means I haven't been able to confirm that my column standardization method is perfect.
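
One possible shape for that change: Python's standard `gzip` module can wrap the existing `csv.writer` output. This is a sketch only; the file name and row here are illustrative, not part of the scripts.

```python
import csv
import gzip

# Sketch only: write standardized rows to a gzipped CSV instead of plain text,
# trading some speed for much less of the ~23 GB the plain output needs now.
with gzip.open('Areas_0.csv.gz', 'wt', encoding='utf-8', newline='') as gz:
    writer = csv.writer(gz)
    writer.writerow(['area_id', 'licensee_id', 'name'])  # illustrative row
```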

I plan on merging these two scripts into one when done debugging.

`InventoryAdjustment_0.csv` seems to have particular encoding issues.
104 changes: 104 additions & 0 deletions SQL/FileMaps.py
@@ -0,0 +1,104 @@
# Column -> SQL type maps for each CCRS table.
# Note: each tableName/fileMap assignment below overwrites the previous one,
# so only the last pair ('strains') is bound when this module is imported.

tableName = 'sale_items'
fileMap = {
    'sale_item_id': 'int',
    'sale_id': 'int',
    'inventory_item_id': 'int',
    'plant_id': 'int',
    'quantity': 'int',
    'price': 'int',
    'discount': 'int',
    'sales_tax': 'int',
    'other_tax': 'int',
    'external_id': 'varchar',
    'status': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}

tableName = 'areas'
fileMap = {
    'area_id': 'int',
    'licensee_id': 'int',
    'name': 'varchar',
    'is_quarantine': 'varchar',
    'external_id': 'varchar',
    'status': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}

tableName = 'lab_results'
fileMap = {
    'lab_result_id': 'int',
    'lab_licensee_id': 'int',
    'licensee_id': 'int',
    'lab_test_status': 'varchar',
    'inventory_item_id': 'int',
    'test_name': 'varchar',
    'test_date': 'datetime',
    'test_value': 'varchar',
    'external_id': 'varchar',
    'status': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}

tableName = 'licensees'
fileMap = {
    'license_status': 'varchar',
    'licensee_id': 'int',
    'UBI': 'varchar',
    'license_number': 'int',
    'name': 'varchar',
    'dba': 'varchar',
    'license_issue_date': 'datetime',
    'license_expiration_date': 'datetime',
    'external_id': 'varchar',
    'status': 'varchar',
    'address_1': 'varchar',
    'address_2': 'varchar',
    'city': 'varchar',
    'state': 'varchar',
    'zip': 'varchar',
    'county': 'varchar',
    'email': 'varchar',
    'phone': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}

tableName = 'sales'
fileMap = {
    'sale_id': 'varchar',
    'seller_licensee_id': 'varchar',
    'purchaser_licensee_id': 'varchar',
    'sale_type': 'varchar',
    'sale_date': 'datetime',
    'external_id': 'varchar',
    'status': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}

tableName = 'strains'
fileMap = {
    'strain_id': 'int',
    'strain_type': 'varchar',
    'name': 'varchar',
    'status': 'varchar',
    'created_by': 'varchar',
    'created_date': 'datetime',
    'updated_by': 'varchar',
    'updated_date': 'datetime',
}
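
# --- Hypothetical usage sketch (editor's addition, not part of this commit):
# a map like the ones above can drive MySQL DDL generation along these lines.
SQL_TYPES = {'int': 'BIGINT', 'varchar': 'VARCHAR(255)', 'datetime': 'DATETIME'}


def create_table_sql(table_name, file_map):
    # Back-tick each column name and widen the short type tags to MySQL types.
    cols = ',\n  '.join(f'`{col}` {SQL_TYPES[typ]}' for col, typ in file_map.items())
    return f'CREATE TABLE IF NOT EXISTS `{table_name}` (\n  {cols}\n);'


# tableName/fileMap are the last pair defined above ('strains').
print(create_table_sql(tableName, fileMap))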
95 changes: 95 additions & 0 deletions columnStandardization.py
@@ -0,0 +1,95 @@
import csv
import os

import pandas as pd

# Running this removes the old source files to save disk space; comment out
# the os.remove() line below to retain them.
src = '/src/path/with/inflated/csvs'
dest = '/dest/path/for/standardized/column/width/csvs'

df_results = pd.DataFrame(columns=['file', 'columns_removed'])
df_bad_rows = pd.DataFrame(columns=['file', 'row', 'element', 'next_element'])


# Dialect for reading in the tab-delimited .csv files.
class customDialect(csv.Dialect):
    """Tab-delimited variant of the usual Excel-generated CSV properties."""
    delimiter = '\t'
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = csv.QUOTE_MINIMAL


csv.register_dialect("customDialect", customDialect)

# 'file prefix': (expected length of the row list, index of the field that may
# contain stray delimiters)
file_types = {
    'Areas_': (10, 2),
    'Contacts_': (13, 0),
    'Integrator_': (7, 1),
    'Inventory_': (16, 1),
    'InventoryAdjustment_': (12, 3),
    'InventoryPlantTransfer_': (17, 0),
    'LabResult_': (14, 6),
    'Plant_': (18, 0),
    'PlantDestructions_': (11, 7),
    'Product_': (12, 3),  # probably 4 (description) as well; 3 is name
    'SaleHeader_': (11, 5),
    'SalesDetail_': (15, 9),
    'Strains_': (8, 2),
    'Licensee_': (22, 4),  # maybe 5 (DBA) as well
}

# Since the problem cells in Product_ and Licensee_ are adjacent, I can likely
# do nothing extra and they would be combined fine.
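
# Worked example (illustrative values) for 'Strains_' (8 expected cells, bad
# field at index 2): a strain name containing a literal tab arrives split into
# 9 cells, and the merge loop below concatenates cells 2 and 3 back together,
# dropping the stray delimiter:
#   ['7', 'Hybrid', 'Blue', 'Dream', 'Active', ...]
#       -> ['7', 'Hybrid', 'BlueDream', 'Active', ...]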

for filename in os.listdir(src):
    reqColumns = 0
    badLocation = 0
    columns_removed = 0
    for key in file_types:
        if filename.startswith(key):
            reqColumns = file_types[key][0]
            badLocation = file_types[key][1]
    if reqColumns == 0:
        continue  # skip files that don't match a known prefix
    f = os.path.join(src, filename)
    filename_csv = filename[:-4] + ".csv"
    o = os.path.join(dest, filename_csv)
    print(f + ' is losing columns')
    with open(f, 'rt', encoding='utf-8', newline='') as csvFile:
        columns_removed = 0
        with open(o, "a", encoding="utf-8", newline='') as outputFile:
            datareader = csv.reader(csvFile, dialect='customDialect')
            datawriter = csv.writer(outputFile)
            for row in datareader:
                valuesList = []
                for column in row:
                    valuesList.append(column)
                if len(valuesList) < reqColumns:
                    print(valuesList)  # short rows are only reported, not fixed
                while reqColumns < len(valuesList):
                    # Log the merge, then join the offending cell with its neighbor.
                    df_bad_row = pd.DataFrame(
                        [[
                            os.path.basename(f), row, valuesList[badLocation],
                            valuesList[badLocation + 1]
                        ]],
                        columns=['file', 'row', 'element', 'next_element'])
                    valuesList[badLocation] = valuesList[
                        badLocation] + valuesList[badLocation + 1]
                    df_bad_rows = pd.concat([df_bad_rows, df_bad_row])
                    del valuesList[badLocation + 1]
                    columns_removed += 1
                datawriter.writerow(valuesList)
    df_result = pd.DataFrame(
        [[os.path.basename(f), str(columns_removed)]],
        columns=['file', 'columns_removed'])
    df_results = pd.concat([df_results, df_result])
    print(o + ' lost ' + str(columns_removed) + ' columns')
    os.remove(f)

df_results.to_csv('df_results.csv', index=False)
df_bad_rows.to_csv('df_bad_rows.csv', index=False)
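
# Post-run sanity check (editor's sketch, not part of this commit):
# df_bad_rows.csv records one row per merged cell, so the files that needed
# the most repairs can be ranked directly.
bad = pd.read_csv('df_bad_rows.csv')
print(bad['file'].value_counts())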
96 changes: 48 additions & 48 deletions inflate.py
@@ -1,50 +1,50 @@
import codecs
import csv
import os
import zipfile
src = "/src/folder/with/CCRS/Box/Download"
temp = "/temp/folder/for/intermediate/file/storage"
dest = "/dest/folder/for/all/inflated/files"


# Dialect for reading in .csv files.
class customDialect(csv.Dialect):
    """Tab-delimited variant of the usual Excel-generated CSV properties."""
    delimiter = '\t'
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = csv.QUOTE_MINIMAL


csv.register_dialect("customDialect", customDialect)

# Walk through all subdirectories of a CCRS Box download, inflate each .zip,
# read the extracted file as UTF-16-LE, and write it back out as UTF-8.
# temp is a temporary location for the extracted intermediate files.
write_count = 0
for root, dirs, files in os.walk(src):
    for file in files:
        if file.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(root, file), 'r') as zip_ref:
                zip_ref.extractall(temp)
                # Each CCRS zip is assumed to hold a single data file.
                extracted_file = zip_ref.namelist()[0]
                extracted_file_base = os.path.basename(extracted_file)
                extracted_file_path = os.path.join(temp, extracted_file)
                with codecs.open(extracted_file_path,
                                 encoding='utf-16-le',
                                 errors='replace') as source_file:
                    # Strip any Windows-style path prefix baked into the entry name.
                    dest_file_base = extracted_file_base.rsplit('\\', 1)[-1]
                    dest_file_base = dest_file_base[:-4] + ".tsv"
                    dest_file_path = os.path.join(dest, dest_file_base)
                    with codecs.open(dest_file_path, 'w',
                                     encoding='utf-8') as dest_file:
                        for row in source_file:
                            # Drop non-ASCII characters entirely.
                            row = ''.join(
                                [c if ord(c) < 128 else '' for c in row])
                            dest_file.write(row)
                        write_count += 1
            os.remove(extracted_file_path)
print('files written: ' + str(write_count))
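
# Editor's sketch (not part of this commit; the path and file name are
# hypothetical): a quick way to confirm a raw extracted file really is
# UTF-16-LE is to look for its byte-order mark.
with open('/temp/folder/for/intermediate/file/storage/Areas_0.csv', 'rb') as f:
    print(f.read(2) == b'\xff\xfe')  # UTF-16-LE BOM, if the file carries one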
