This repository has been archived by the owner on Apr 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparse.rb
132 lines (108 loc) · 4.03 KB
/
parse.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# encoding: UTF-8
# Extract vaccination data from official PDF reports, already converted to txt
require 'csv'
# Collect the raw text lines of the data table found in a converted report.
#
# The table starts at the first line whose stripped text begins with
# "Andalucía" and ends right before the "Fuente: AEMPS. El reparto de dosis"
# footnote (or at the end of the file when that footnote is absent).
#
# The file is always decoded as UTF-8: the start marker contains a non-ASCII
# character, so relying on the locale's default encoding makes the match
# raise when the script runs under a non-UTF-8 locale.
#
# filename - path of the text file to scan.
#
# Returns an Array of Strings with line terminators kept; the Array is empty
# when the start marker never shows up.
def get_data_table(filename)
  lines = []
  in_data_table = false
  File.foreach(filename, encoding: 'UTF-8') do |line|
    # Nothing is kept until the first region row has been seen.
    in_data_table ||= line.strip.start_with?('Andalucía')
    next unless in_data_table

    # The footnote marks the end of the table; no need to read any further.
    break if line.include?('Fuente: AEMPS. El reparto de dosis')

    lines << line
  end
  lines
end
# Report dates (YYYYMMDD) before which a column was not yet published, paired
# with the index where a blank cell is inserted to make up for it. Entries
# are applied in this order, and every index refers to the row as left by the
# previous entries, so the order must not be changed.
BLANK_COLUMNS_BEFORE = [
  ['20210209', 3],  # AstraZeneca doses
  ['20210422', 4],  # Janssen doses
  ['20210118', 8],  # people with completed treatment
  ['20210406', 8],  # people with at least one dose (could be deduced as
                    # total doses - 2 * fully vaccinated, but is left blank)
  ['20211006', 10], # people with additional doses
  ['20211118', 10], # % of incomplete treatments
  ['20211220', 2]   # pediatric Pfizer doses
].freeze

# Turn the text lines of one report's data table into CSV rows on stdout.
#
# Every non-blank line becomes one row: the report date (DD/MM/YYYY, taken
# from the file name) followed by the line's columns, padded with blank cells
# so that reports of any date share the same layout.
#
# lines    - Array of Strings, one per table line. The strings are not
#            modified, so frozen input is fine.
# filename - report path ending in YYYYMMDD.txt. When no date can be found
#            the date cells come out empty and every padding step is applied.
#
# Returns lines.
def extract_data(lines, filename)
  _, year, month, day = filename.match(/(\d{4})(\d{2})(\d{2})\.txt/).to_a
  report_date = "#{year}#{month}#{day}" # ISO-like, so plain string comparison sorts by date
  formatted_date = "#{day}/#{month}/#{year}"

  lines.each do |raw_line|
    line = raw_line.strip
    next if line.empty?

    # The PDF to text conversion (-table option) separates columns with runs
    # of spaces, while names such as "Castilla La Mancha" contain single
    # spaces, so only two or more consecutive spaces count as a separator.
    columns = line.split(/ {2,}/)

    # Remove footnote markers for consistency across days.
    columns[0] = columns[0].gsub(' (*)', '').gsub(' (**)', '').gsub('*', '')

    # Castilla La Mancha sometimes breaks across two lines: the first one
    # carries the data, the second one holds just the rest of the name.
    columns[0] = 'Castilla La Mancha' if columns[0] == 'Castilla La'
    next if columns[0] == 'Mancha'

    # The first reports were inconsistent about the dates provided; this one
    # day carries an extra column that has to go.
    columns.delete_at(4) if report_date == '20210105'

    # Before 20210114 only Pfizer was reported, so its doses equal the total
    # and Moderna gets a blank cell.
    if report_date < '20210114'
      columns.insert(2, columns[1])
      columns.insert(2, nil)
    end

    BLANK_COLUMNS_BEFORE.each do |cutoff, index|
      columns.insert(index, nil) if report_date < cutoff
    end

    # The summary line has no date at the end; add an empty cell so every
    # row has the same width (Github's web preview needs that).
    columns.push(nil) if columns[0] == 'Totales'

    puts CSV.generate_line([formatted_date, *columns])
  end
end
# Print the CSV header row first, then the rows of every report found under
# reports/, visiting the files in sorted path order.
header = [
  'informe',
  'comunidad autónoma',
  'dosis Pfizer',
  'dosis Pfizer pediátrica',
  'dosis Moderna',
  'dosis AstraZeneca',
  'dosis Janssen',
  'dosis entregadas',
  'dosis administradas',
  '% sobre entregadas',
  'personas con al menos una dosis',
  'personas con pauta completa',
  '% pautas incompletas',
  'personas con dosis adicionales',
  'última vacuna registrada'
]
puts CSV.generate_line(header)

Dir.glob('reports/*txt').sort.each do |report_path|
  table_lines = get_data_table(report_path)
  extract_data(table_lines, report_path)
end