-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnalyzeDataFrame.py
113 lines (92 loc) · 3.04 KB
/
AnalyzeDataFrame.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/python
import sys
import getopt
import pandas as pd
# Read a CSV file (for example, the sample attributes retrieved for a BioProject)
# and summarize how the values of each attribute (column) vary across its rows.
# The intent is to provide a report which will help a user:
# 1. Identify the data model of the study.
# 2. Identify whether the project uses a template similar to other studies.
def _print_attribute_group(names, *heading):
    """Print the heading lines, then one attribute name per line.

    Nothing is printed when ``names`` is empty.
    """
    if not names:
        return
    for line in heading:
        print(line)
    for name in names:
        print(name)


def DataFrameAttributes(path):
    """Print a report on how each column of a CSV file varies across rows.

    The file at ``path`` is loaded with ``pandas.read_csv`` and every column
    is placed in exactly one of these groups, tested in this order:

    * unique        - the number of distinct values equals the row count
    * almost unique - distinct values are more than 80% of the row count
    * constant      - one distinct value, present in every row
    * single value  - one distinct value, but missing from some rows
    * varying       - everything else; its value counts are printed

    Args:
        path: Path of the CSV file to analyze.

    Returns:
        None. The report is written to standard output.
    """
    separator = '____________________________________'
    print(separator)
    print('Attribute details for file: ' + path)
    df = pd.read_csv(path)

    rowCount = df.shape[0]
    print('No of rows:' + str(rowCount))
    print(separator)
    print('The following attributes vary across row.')
    print('Some may indicate the dataset design/model.')
    print('Some may be sample/subject observations/measurements/data elements.')
    print()

    uniques = []
    almostUniques = []
    constants = []
    singleValue = []
    for c in df.columns:
        values = df[c]
        uniqueValueCount = values.nunique()
        # With zero rows the first test matches (0 == 0), so the division
        # in the second test is never reached with rowCount == 0.
        if uniqueValueCount == rowCount:
            uniques.append(c)
        elif 100.0 * uniqueValueCount / rowCount > 80.0:
            almostUniques.append(c)
        elif uniqueValueCount == 1:
            # count() is the number of non-missing cells. It must be
            # *called*: comparing the bound method itself to an int is
            # always False, which sent every column to singleValue.
            if values.count() == rowCount:
                constants.append(c)
            else:
                singleValue.append(c)
        else:
            # Varying columns are reported immediately, value by value.
            print('Column:' + c)
            print('value\trowCount')
            counts = values.value_counts().to_dict()
            for k, v in counts.items():
                print('{}\t{}'.format(k, v))
    print(separator)

    _print_attribute_group(
        uniques,
        'The following attributes have a unique value for each row. ',
        'They are therefore likely to some kind of identifier.',
    )
    print(separator)

    _print_attribute_group(
        almostUniques,
        'The following attributes have a unique value for more than 80% of rows.',
        'They are often a subject identifier.',
    )
    print(separator)

    _print_attribute_group(
        constants,
        'The following have the same value for all samples.',
        'They are likely to be an attribute of the dataset rather than the row',
    )
    print(separator)

    _print_attribute_group(
        singleValue,
        'The following have only one value in the dataset',
        'but the attribute is not present for all rows',
    )
    print(separator)
def usage():
    """Print the command-line synopsis of this script to standard output."""
    script_name = sys.argv[0]
    synopsis = script_name + ' -f csvfilepath'
    print(synopsis)
def main(argv):
    """Parse the command-line options and run the attribute report.

    Recognized options:
        -h, --help         print the usage line and exit
        -f, --file PATH    CSV file to analyze

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 when the options cannot be parsed or when no file
    was given.
    """
    try:
        opts, _args = getopt.getopt(argv, "hf:", ["help", "file="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    filepath = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-f", "--file"):
            filepath = arg

    # Without -f/--file there is nothing to analyze; report it as a usage
    # error instead of failing on an unbound local variable.
    if filepath is None:
        usage()
        sys.exit(2)

    DataFrameAttributes(filepath)
# Script entry point: runs only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    cli_args = sys.argv[1:]  # drop the program name
    main(cli_args)