
Commit

Merge pull request #9 from congjian-wang/mandd/ER_schema
Mandd/er schema
wangcj05 authored and GitHub Enterprise committed Jun 12, 2024
2 parents b52c97f + 3bc0676 commit fd7bd31
Showing 4 changed files with 819 additions and 95 deletions.
Binary file modified: data/abbreviations.xlsx (binary content not shown)
Binary file modified: data/tag_keywords_lists.xlsx (binary content not shown)
others/tagKeywordListReader.py: 338 changes (243 additions, 95 deletions)
@@ -17,100 +17,248 @@
tagsDictCleaned, acronymsDict = cleanTagDict(tagsDict)
'''

def keyWordListGenerator(fileName):
'''
Method designed to read the file and generate a dictionary which contains, for each tag,
the set of keywords that should be associated with that tag.
'''
# TODO: ==> check if lower, and lemma (e.g. for plural)!!!
# TODO: subsets of words
# read excel file .xlsx
df = pd.read_excel(fileName, None)
# retrieve list of sheets in excel file
sheet_list = df.keys()

tagsDict = {}
for sheet in sheet_list:
# retrieve columns of each sheet
cols = df[sheet].keys()
for col in cols:
# retrieve TAG of each column; it should be contained in square brackets [tag]
first = col.find("[")
second = col.find("]")
tagID = col[first+1:second]
keywordsList = df[sheet][col].dropna().values.tolist()
keywordsList = [[i] for i in keywordsList if i]
for index,keyword in enumerate(keywordsList):
if ',' in keyword[0]:
keywordsList[index] = keyword[0].split(',')
tagsDict[tagID] = list(itertools.chain(*keywordsList))
return tagsDict

def extractUnits(fileName):
'''
Method designed to extract measure units from provided file.
It returns a dictionary which contains, for each quantity, a list of commonly used units, e.g.,
{'Pressure': ['pa', 'torr', 'barr', 'atm', 'psi']}
'''
measuresDict = {}
df = pd.read_excel(fileName, None)
measures = df['operands'][['Properties [prop]','units [unit]']]
for index,elem in measures.iterrows():
if not pd.isnull(elem['units [unit]']):
measuresDict[elem['Properties [prop]']] = elem['units [unit]'].replace(" ", "").split(',')
return measuresDict
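
A minimal usage sketch of extractUnits, assuming the module is importable as tagKeywordListReader and the spreadsheet path below (taken from this repository) is reachable; the printed list is only illustrative of the expected form:

from tagKeywordListReader import extractUnits

measuresDict = extractUnits('data/tag_keywords_lists.xlsx')
print(measuresDict.get('Pressure'))
# illustrative output: ['pa', 'torr', 'barr', 'atm', 'psi']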

def cleanTagDict(tagsDict):
'''
Method designed to clean the dictionary generated by the method keyWordListGenerator(.)
Here, specific characters or substrings are removed.
In addition, if an acronym is defined (within round parentheses), then the acronymsDict is
populated as {acronym: acronym_definition}
'''
acronymsDict = {}
n_keywords = 0
for tag in tagsDict:
for index,elem in enumerate(tagsDict[tag]):
# clean string
cleanElem = elem.lower()
cleanElem = cleanElem.strip().lstrip()
cleanElem = cleanElem.replace("\xa0", " ")
cleanElem = cleanElem.replace("\n", " ")
# Note that here we are removing the hyphen
cleanElem = cleanElem.replace("-", " ")

# retrieve acronym if defined
first = cleanElem.find("(")
second = cleanElem.find(")")
if (first==-1 and second>=0) or (second==-1 and first>=0):
print('Error of acronym definition')
if (first>=0 and second>=0):
acronym = cleanElem[first + 1:second].strip().lstrip()
to_replace = "(" + acronym + ")"
cleanElem = cleanElem.replace(to_replace,'')
cleanElem = " ".join(cleanElem.split())
# save acronym into its own dictionary
acronymsDict[acronym] = cleanElem.strip().lstrip()
# remove acronym from tags_dict
to_replace = "(" + acronym + ")"
cleanElem = cleanElem.replace(to_replace,'')
else:
cleanElem = cleanElem

tagsDict[tag][index] = " ".join(cleanElem.split()) # clean_elem
tagsDict[tag] = [i for i in tagsDict[tag] if i]

for tag in tagsDict:
n_keywords = n_keywords + len(tagsDict[tag])
print("Number of listed keywords: " + str(n_keywords))
tagsDictChecker(tagsDict)
return tagsDict, acronymsDict

def tagsDictChecker(tagsDict):
for key1 in tagsDict.keys():
for key2 in tagsDict.keys():
commonElements = list(set(tagsDict[key1]).intersection(tagsDict[key2]))
if key1!=key2 and commonElements:
print('Elements in common between ' +str(key1)+ ' and ' +str(key2)+ ' are:' + str(commonElements))
class ERschema():
"""
Class designed to codify the equipment reliability (ER) schema and classify the content of a clause/sentence
"""

def __init__(self):
"""
Initialization method
Args:
None
Returns:
None
"""
self.matchDict = {}
self.matchDict['surv_tool'] = ['surv_tool']
self.matchDict['inspection'] = ['surv_ops','HS_neut']
self.matchDict['diagnosis'] = ['diagn']
self.matchDict['maintenance'] = ['mnt_ops']
self.matchDict['maint_tool'] = ['mnt_tool']
self.matchDict['location'] = ['arch']
self.matchDict['material'] = ['chem_cmpd','chem_elem','mat']
self.matchDict['reaction'] = ['chem_rx']
self.matchDict['env_agent'] = ['ext_agent']
self.matchDict['degradation'] = ['deg_mech']
self.matchDict['function'] = ['opd_elt','opd_hyd_pne']
self.matchDict['qual_asmnt'] = ['qual_asmnt']
self.matchDict['asset[anomalous]'] = ['HS_neg','fail_type_n','fail_type_v']
self.matchDict['asset[OK]'] = ['HS_pos']
self.matchDict['asset'] = ['comp_mech_fact',
'comp_mech_rot',
'comp_mech_struct',
'comp_mech_spec',
'comp_elt/n',
'comp_hyd/pne',
'ast_mech',
'ast_elt',
'ast_hyd_pne',
'ast_eln',
'ast_I&C',
'ast_fuel']

self.invMatchDict = {}
for key in self.matchDict.keys():
for elem in self.matchDict[key]:
self.invMatchDict[elem] = key

def returnERnature(self, labelList):
"""
Method that maps each label in labelList to its corresponding element of the ER schema
Args:
labelList, list, list that contains labels identified in a text
Returns:
nature, list, list that contains the corresponding elements in the ER schema for each label contained in labelList
"""
nature = []
for elem in labelList:
nature.append(self.invMatchDict[elem])
return nature
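
For illustration, a minimal sketch of the label-to-schema mapping; the labels below appear in matchDict above, and the output follows directly from invMatchDict:

schema = ERschema()
print(schema.returnERnature(['HS_pos', 'deg_mech', 'ast_mech']))
# -> ['asset[OK]', 'degradation', 'asset']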


class entityLibrary():
"""
Class designed to contain all nuclear-related entities listed in nlp/data/tag_keywords_lists.xlsx
"""
def __init__(self,fileName):
"""
Initialization method
Args:
fileName, string, file containing nuclear related entities
Returns:
None
"""
self.library = self.keyWordListGenerator(fileName)
self.cleanTagDict()
self.expander()
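
A minimal construction sketch, assuming the spreadsheet shipped in this repository is reachable at the relative path shown:

library = entityLibrary('data/tag_keywords_lists.xlsx')
tagsDict = library.getLibrary()          # {label: [entities]}
acronymsDict = library.getAcronymsDict() # {acronym: definition}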

def checker(self):
"""
Method designed to check the structure of the set of nuclear-related entities and identify entities
that are listed under multiple labels
Args:
None
Returns:
None
"""
for key1 in self.library.keys():
for key2 in self.library.keys():
commonElements = list(set(self.library[key1]).intersection(self.library[key2]))
if key1!=key2 and commonElements:
print('Elements in common between ' +str(key1)+ ' and ' +str(key2)+ ' are:' + str(commonElements))
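
For illustration, using the library object from the sketch above: if the same entity were listed under two labels, checker() would print one line per overlapping pair (the entity below is hypothetical):

library.checker()
# hypothetical output if 'corrosion' appeared under both deg_mech and chem_rx:
# Elements in common between deg_mech and chem_rx are:['corrosion']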

def getLibrary(self):
"""
Method designed to return self.library
Args:
None
Returns:
self.library, dict, dictionary containing for each label a list of entities
"""
return self.library

def getAcronymsDict(self):
"""
Method designed to return self.acronymsDict
Args:
None
Returns:
self.acronymsDict, dict, dictionary of the acronyms defined in the library
"""
return self.acronymsDict

def expander(self):
"""
Method designed to handle entities that are hyphenated compounds of two words, written as word1-word2.
These compound words can appear in multiple forms: "word1-word2", "word1word2", "word1 word2".
Here, these forms are generated for each identified compound word (i.e., whenever '-' appears in the entity)
Args:
None
Returns:
None
"""
for key in self.library.keys():
# iterate over a snapshot of the list, since new surface forms are appended to it
for elem in list(self.library[key]):
if '-' in elem:
self.library[key].append(elem.replace('-',''))
self.library[key].append(elem.replace('-',' '))
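
For illustration, the expansion of one hyphenated entity (the label and entity are hypothetical):

# before expander(): self.library['deg_mech'] = ['stress-corrosion']
# after expander():  self.library['deg_mech'] = ['stress-corrosion', 'stresscorrosion', 'stress corrosion']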


def keyWordListGenerator(self, fileName):
"""
Method designed to read the file and generate a dictionary which contains, for each tag,
the set of keywords that should be associated with that tag.
Args:
fileName, string, file containing nuclear related entities
Returns:
tagsDict, dict, dictionary containing for each label a list of entities
"""

df = pd.read_excel(fileName, None)
# retrieve list of sheets in excel file
sheetList = df.keys()

tagsDict = {}
for sheet in sheetList:
# retrieve columns of each sheet
cols = df[sheet].keys()
for col in cols:
# retrieve TAG of each column; it should be contained in square brackets [tag]
first = col.find("[")
second = col.find("]")
tagID = col[first+1:second]

if tagID not in ['prop','unit']:
keywordsList = df[sheet][col].dropna().values.tolist()
keywordsList = [[i] for i in keywordsList if i]
for index,keyword in enumerate(keywordsList):
if ',' in keyword[0]:
keywordsList[index] = keyword[0].split(',')
tagsDict[tagID] = list(itertools.chain(*keywordsList))
return tagsDict
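
As an illustration of the column-header convention the method expects (the sheet, header, and cell values are hypothetical, but follow the [tag] pattern used in the spreadsheet):

# column header:   'Degradation mechanisms [deg_mech]'
# cell values:     'corrosion', 'fatigue, creep'
# resulting entry: tagsDict['deg_mech'] = ['corrosion', 'fatigue', ' creep']
# (the leading space in ' creep' is removed later by cleanTagDict)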


def cleanTagDict(self):
"""
Method designed to clean the dictionary generated by the method keyWordListGenerator(.)
Here, specific characters or substrings are removed.
In addition, if an acronym is defined (within round parentheses), then self.acronymsDict is
populated as {acronym: acronym_definition}
Args:
None
Returns:
None
"""

self.acronymsDict = {}
nKeywords = 0
for tag in self.library:
for index,elem in enumerate(self.library[tag]):
# clean string
cleanElem = elem.lower()
cleanElem = cleanElem.strip().lstrip()
cleanElem = cleanElem.replace("\xa0", " ")
cleanElem = cleanElem.replace("\n", " ")

# retrieve acronym if defined
first = cleanElem.find("(")
second = cleanElem.find(")")
if (first==-1 and second>=0) or (second==-1 and first>=0):
print('Error of acronym definition')
if (first>=0 and second>=0):
acronym = cleanElem[first + 1:second].strip()
# remove the acronym definition "(acronym)" from the entity string
toReplace = "(" + acronym + ")"
cleanElem = cleanElem.replace(toReplace,'')
cleanElem = " ".join(cleanElem.split())
# save acronym into its own dictionary
self.acronymsDict[acronym] = cleanElem

self.library[tag][index] = " ".join(cleanElem.split()) # clean_elem
self.library[tag] = [i for i in self.library[tag] if i]

for tag in self.library:
nKeywords = nKeywords + len(self.library[tag])
print("Number of listed keywords: " + str(nKeywords))





