forked from christianmarechal/searchfirstgoodcsvline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSearchFirstGoodVanilla.py
181 lines (140 loc) · 4.23 KB
/
SearchFirstGoodVanilla.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 20:08:07 2018
@author: christian marechal
Find the first useful line of a CSV file, i.e. the line where the
regular structure of a usable table begins.
"""
import os
# Switch the process working directory to a hard-coded folder so that the
# relative CSV file names opened further down resolve against it.
# NOTE(review): os.chdir raises OSError if this folder does not exist, and it
# runs as soon as the module is executed or imported.
path = 'D:/ecomdataforgoodfr/PushMyGIT'
os.chdir(path)
import csv
def readData(fileName):
    """Return every line of the UTF-8 text file *fileName* as a list.

    Each element keeps its trailing newline, exactly as produced by
    ``readlines()``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # The context manager closes the handle even when readlines() raises
    # (e.g. on a decoding error), which the manual open/close pair did not.
    with open(fileName, "r", encoding="utf-8") as f:
        return f.readlines()
def readData2(fileName):
    """Return every line of the text file *fileName* as a list.

    Unlike ``readData`` no encoding is passed to ``open``, so the platform
    default text encoding is used. Each element keeps its trailing newline.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content cannot be decoded with the
            default encoding.
    """
    # The context manager closes the handle even when readlines() raises.
    with open(fileName, "r") as f:
        return f.readlines()
# Search for the first regular (table-like) region in a CSV file.
# csvlignes = the first lines of a CSV file
# limitemax = maximum exploration depth (number of lines examined)
# traceon   = when true, print diagnostic output
# "Vanilla" version
import collections
# Do not call directly: subroutine of searFirstCSVlineGoodVanilla2
# Candidate delimiters, in the order used for the per-line count vectors.
_CSV_SEPARATORS = (',', ';', '\t')


def _scan_separators(csvlignes, limitemax):
    """Count each candidate delimiter per line and elect the dominant one.

    At most ``limitemax`` lines are examined (always at least one when the
    input is not empty).

    Returns:
        ``(counts, mx)`` where ``counts`` holds one
        ``[n_comma, n_semicolon, n_tab]`` list per examined line and ``mx``
        is the index in ``_CSV_SEPARATORS`` of the delimiter that is the
        dominant one on the largest number of lines (0 when no line
        contains any candidate delimiter).
    """
    counts = []
    votes = collections.Counter()
    for n, ligne in enumerate(csvlignes, start=1):
        tab = [ligne.count(sep) for sep in _CSV_SEPARATORS]
        counts.append(tab)
        best = max(tab)
        # A line without any candidate delimiter says nothing about the
        # delimiter, so it does not vote (it used to count as a ',' vote).
        if best > 0:
            votes[tab.index(best)] += 1
        if n >= limitemax:
            break
    # Most frequent winner. The previous max(Counter(...)) returned the
    # largest *key*, i.e. the highest separator index ever seen, so a single
    # tab-dominant line forced "tab" for the whole file.
    mx = votes.most_common(1)[0][0] if votes else 0
    return counts, mx


def _trace_result(df, mx, premiereLigne):
    """Print the per-line delimiter counts and the detection result."""
    print(df)
    spar = [",", ";", "tabulation"]
    print("Separateur=[" + spar[mx] + "], premiere ligne=" + str(premiereLigne))


def searFirstCSVlineGoodVanilla(csvlignes, limitemax, traceon):
    """Locate the first line of the regular table inside CSV lines.

    The delimiter is elected among ',', ';' and tab. The most common
    per-line count of that delimiter is taken as the signature of a table
    row, and the index of the first line that starts two consecutive lines
    with that signature is returned.

    Args:
        csvlignes: iterable of text lines (the beginning of a CSV file).
        limitemax: maximum number of lines examined.
        traceon: when truthy, print the count table and the result.

    Returns:
        Zero-based index of the first table line. If no two consecutive
        lines match, the index of the last isolated matching line is
        returned. Returns 0 for empty input.
    """
    df, mx = _scan_separators(csvlignes, limitemax)

    # Column of interest: occurrences of the elected delimiter on each line.
    df2 = [row[mx] for row in df]

    # Most common count. ">=" means that among equally common counts the one
    # first seen latest in the file wins (most_common keeps first-seen order
    # for ties).
    mymax = 0
    mymaxid = 0
    for valeur, occurrences in collections.Counter(df2).most_common():
        if occurrences >= mymax:
            mymax = occurrences
            mymaxid = valeur

    # First line that opens a run of at least two consecutive matching lines.
    premiereLigne = 0
    candidat = False
    for i, valeur in enumerate(df2):
        if candidat:
            if valeur == mymaxid:
                break
            # The candidate was an isolated line: drop it and keep looking.
            candidat = False
        elif valeur == mymaxid:
            premiereLigne = i
            candidat = True

    if traceon:
        _trace_result(df, mx, premiereLigne)
    return premiereLigne


def searFirstCSVlineGoodVanilla2(csvlignes, limitemax, traceon):
    """Locate the first table line, with a fallback on field counts.

    ``searFirstCSVlineGoodVanilla`` is tried first and its answer is
    returned when it is greater than 0. Otherwise every examined line is
    split on the elected delimiter, empty fields are discarded, and the
    index of the first line holding at least ``round(mean) - 1`` fields is
    returned.

    Args:
        csvlignes: lines of text. It is iterated several times, so it must
            be re-iterable (e.g. a list), not a one-shot iterator.
        limitemax: maximum number of lines examined.
        traceon: when truthy, print the count table and the result.

    Returns:
        Zero-based index of the first table line; 0 when nothing better is
        found or when the input is empty.
    """
    cas0 = searFirstCSVlineGoodVanilla(csvlignes, limitemax, traceon)
    if cas0 > 0:
        return cas0

    # Fallback: reason on the number of non-empty fields per line.
    df, mx = _scan_separators(csvlignes, limitemax)
    separateur = _CSV_SEPARATORS[mx]

    df2 = []
    for n, ligne in enumerate(csvlignes, start=1):
        champs = [x for x in ligne.split(separateur) if x]  # drop empty fields
        df2.append(len(champs))
        if n >= limitemax:
            break

    premiereLigne = 0
    if df2:  # empty input would otherwise divide by zero
        moyenne = round(sum(df2) / len(df2)) - 1
        # NOTE(review): unlike the strict search above, a candidate is never
        # discarded here, so this is simply the first line reaching the
        # threshold; confirm whether two consecutive lines were intended.
        premiereLigne = next(
            (i for i, nb in enumerate(df2) if nb >= moyenne), 0
        )

    if traceon:
        _trace_result(df, mx, premiereLigne)
    return premiereLigne
# Driver: run the detection on the two sample files and print the result.
# test.csv is read as UTF-8, test2.csv with the platform default encoding.
limitemax = 100
traceon = False
for csvfile, lecteur in (('test.csv', readData), ('test2.csv', readData2)):
    csvlignes = lecteur(csvfile)
    numero = searFirstCSVlineGoodVanilla2(csvlignes, limitemax, traceon)
    print("Premiere ligne->" + str(numero))