-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_followup_pipeline.py
113 lines (90 loc) · 3.71 KB
/
run_followup_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import ast
import re
import pandas as pd
import prodigy
import spacy
from spacy.training import Corpus
from spacy.tokens import DocBin
from spacy.language import Language
from Weighted_TextCategorizer import TextCategorizer
# --- Run configuration ------------------------------------------------------
# Directory of the trained spaCy pipeline that is loaded for prediction.
MODEL_PATH = "/home/vs428/Documents/Moore/followup_model/model-best"

# Table of reports to classify; it must provide 'CT_text' and 'ID' columns.
INPUT_FILE = "/home/vs428/project/Moore_data/Chest_CT_Reports_HVRADS(1).csv"

# Progress reporting level: 0 turns the progress messages off, and larger
# values make the per-document counter print more often.
VERBOSITY = 10

# When true, ask spaCy for a GPU before the model is loaded.
GPU = True

if GPU:
    spacy.require_gpu()
def convert_dict_str_to_dict(x, col_name):
    """Parse the dict stored as a string under ``x[col_name]``.

    Intended for ``DataFrame.apply(..., axis=1)``, where ``x`` is one row.

    Args:
        x: Mapping-like row (e.g. a pandas Series) holding the string.
        col_name: Key/column in ``x`` whose value is the serialized dict.

    Returns:
        The parsed Python object (a dict for well-formed input).

    Raises:
        ValueError, SyntaxError: If the value is neither a Python literal nor
            valid JSON; the error from the Python-literal parser is raised.
    """
    # Imported locally so the function does not rely on module import order.
    import json

    raw = x[col_name]
    try:
        # Python-literal parsing stays first so existing inputs behave as before.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError) as literal_err:
        if not isinstance(raw, str):
            raise
        # Strings produced by json.dumps can contain tokens that are not Python
        # literals (null, true, false, NaN, Infinity); parse those as JSON.
        try:
            return json.loads(raw)
        except ValueError:
            # Keep reporting the original parser's error to callers.
            raise literal_err from None
# --- Load and validate the input table --------------------------------------
# Check the extension before reading anything, so an unsupported file fails
# immediately instead of after it has already been parsed.
_, ext = os.path.splitext(INPUT_FILE)
file_type = ext[1:]
if file_type not in ['csv', "tsv", "xlsx", "xls"]:
    raise ValueError(f"{ext} file type not supported")

# Read with the parser that matches the extension; the writer at the end of
# the script dispatches on the extension the same way.
if file_type in ("xlsx", "xls"):
    data_in = pd.read_excel(INPUT_FILE)
elif file_type == "tsv":
    data_in = pd.read_csv(INPUT_FILE, sep="\t")
else:
    data_in = pd.read_csv(INPUT_FILE)

for required_col in ("CT_text", "ID"):
    if required_col not in data_in.columns:
        raise KeyError(f"'{required_col}' is not a column in your input data")

# Load the trained pipeline only once the input is known to be usable.
nlp = spacy.load(MODEL_PATH)

# Convert text into IMPRESSIONS only: keep everything from the first
# "IMPRESSION:" / "Impression:" header onward, or the whole report when no
# header is present.
impression_header = re.compile(r"IMPRESSION:|Impression:")
has_text = ~data_in['CT_text'].isnull()
impressions = []
for text in data_in.loc[has_text, 'CT_text']:
    # Non-string cells are passed through untouched. The original fallback
    # branch referenced an undefined name and would have raised NameError;
    # passing the value through is presumably what it was meant to do.
    match = impression_header.search(text) if isinstance(text, str) else None
    impressions.append(text[match.start():] if match else text)
data_in.loc[has_text, 'CT_text_Impressions'] = impressions

if VERBOSITY:
    print("created impressions...", flush=True)
# Adapted from https://stackoverflow.com/a/44764557/1726404
'''
This works by calling nlp.pipe on (text, context) tuples, so every processed doc comes back paired with its context.
In our work, the context is just the study ID.
For each doc we take the model output (here the predicted category scores in doc.cats, rather than the
entity text, label, start and stop characters used in the Stack Overflow answer),
convert it to a JSON string, and put [context, json] together in a list,
which we append to nlp_out.
We then turn nlp_out into a DataFrame with one column holding the study ID and the other the NLP output.
Finally, we merge that DataFrame with our main data DataFrame, so the predictions sit in columns alongside the report text.
'''
import json

# --- Run the model over every available impression ---------------------------
# Each record is (impression text, study ID); with as_tuples=True nlp.pipe
# hands the ID back next to the processed doc.
has_impression = ~data_in['CT_text_Impressions'].isnull()
text_id_pairs = list(
    data_in.loc[has_impression, ['CT_text_Impressions', 'ID']].to_records(index=False)
)

# (print every N docs, only when VERBOSITY is above this level). The counter is
# printed at most once per doc, when any rule matches.
progress_rules = ((50, 8), (500, 4), (1000, 2), (5000, 1))

nlp_out = []
count = 0
for doc, ctx in nlp.pipe(text_id_pairs, as_tuples=True, batch_size=200, n_process=1):
    nlp_out.append([ctx, json.dumps(doc.cats, indent=2)])
    if not VERBOSITY:
        continue
    count += 1
    if any(count % every == 0 and VERBOSITY > level for every, level in progress_rules):
        print(count, flush=True)

if VERBOSITY:
    print("ran predictions...", flush=True)

# --- Expand the scores and choose the top label -------------------------------
predictions = pd.DataFrame(nlp_out, columns=['ID', 'NLP_OUT'])
scores = predictions.apply(
    convert_dict_str_to_dict, axis=1, col_name="NLP_OUT", result_type="expand"
)
predictions = pd.concat([predictions, scores], axis=1)
label_cols = ["NO_FOLLOWUP", "HARD_FOLLOWUP", "CONDITIONAL_FOLLOWUP"]
# y_pred is the name of the label column with the highest score in each row.
predictions['y_pred'] = predictions[label_cols].idxmax(axis=1)

# Left merge keeps every input row, including those without a prediction.
merged = data_in.merge(predictions, on="ID", how='left')
print(merged['y_pred'].value_counts(normalize=True))
merged = merged.drop("NLP_OUT", axis=1)

# --- Write the result next to the input, in the input's own format -----------
fname, ext = os.path.splitext(INPUT_FILE)
suffix = ext[1:]
out_path = fname + "_predictions1" + ext
if suffix == "tsv":
    merged.to_csv(out_path, index=False, sep="\t")
elif suffix in ("xls", "xlsx"):
    merged.to_excel(out_path, index=False)
elif suffix == "csv":
    merged.to_csv(out_path, index=False)
else:
    # Any other extension falls back to CSV output.
    merged.to_csv(fname + "_predictions1.csv", index=False)