-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMerge_TASS_data.py
35 lines (31 loc) · 1.24 KB
/
Merge_TASS_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# imports
import pandas as pd
import glob
import os
import xml.etree.ElementTree as ET
# Function for xml datasets extraction
def xml_extraction(path):
    """Collect the Spanish tweets from every ``*.xml`` file directly under *path*.

    Each file is parsed with ElementTree and every direct child of the root
    element is treated as one tweet record. A record is expected to contain
    ``tweetid``, ``content``, ``lang`` and ``sentiment/polarity/value``
    elements.

    Args:
        path: Directory searched (non-recursively) for ``*.xml`` files.

    Returns:
        A DataFrame with the columns ``ID`` (tweet id prefixed with
        ``'ID:'``), ``Text`` (tweet content) and ``Tag`` (polarity value),
        one row per record whose ``lang`` text is ``'es'``. The frame is
        empty when no file or no record matches.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
        AttributeError: If a record lacks one of the expected elements.
    """
    columns = ['ID', 'Text', 'Tag']
    # Rows are gathered in a plain list and turned into a DataFrame once at
    # the end; enlarging a DataFrame row by row with .loc gets slower with
    # every insert.
    rows = []
    # glob gives no ordering guarantee, so sort to keep the row order
    # reproducible across runs and platforms.
    for filepath in sorted(glob.glob(os.path.join(path, '*.xml'))):
        print(filepath)
        eroot = ET.parse(filepath).getroot()  # root element of the parsed file
        for tweet in eroot:
            # All fields are read before the language filter so that a
            # malformed record still raises, whatever its language.
            tweet_id = 'ID:' + tweet.find('tweetid').text
            tweet_text = tweet.find('content').text
            lang = tweet.find('lang').text
            polarity_value = (
                tweet.find('sentiment').find('polarity').find('value').text
            )
            if lang == 'es':
                rows.append([tweet_id, tweet_text, polarity_value])
    return pd.DataFrame(rows, columns=columns)
# Applying function to TASS files
# NOTE(review): these two paths are absolute (leading "/") while the TASS2020
# path below is relative -- confirm this is intended. A directory that does
# not exist produces an empty DataFrame rather than an error.
tass2019 = xml_extraction("/TASS2019")
tass2012 = xml_extraction("/TASS2012")
tass2020 = pd.read_csv("TASS2020/TASS2020.csv", encoding='utf8').reset_index(drop=True)
# Join all TASS datasets into one frame with a fresh 0..n-1 index
AllTassDf = pd.concat([tass2012, tass2019, tass2020], ignore_index=True)
# Exporting dataframe as csv file
# (the merged frame is AllTassDf; the name `tassDf` was never defined and
# made the script fail with NameError before anything was written)
AllTassDf.to_csv("ALLdTassDF.csv")