## Configuration variables
from os.path import join as pjoin

def cnf(name, val):
    """Expose config[name] as a module-level global, defaulting to val."""
    globals()[name] = config.setdefault(name, val)

# Intermediate dirs
cnf("WORK", "work")
cnf("WIKIPAGES", WORK + "/pages")
cnf("MOD_DATA", WORK + "/mod_data")
cnf("FSTS", WORK + "/fsts")
cnf("PARSED", WORK + "/parsed")
cnf("STATS", WORK + "/stats")
cnf("LOG", WORK + "/log")

# Input
WIKIDUMP = config["WIKIDUMP"]
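
# WIKIDUMP has no default, so it must be supplied on the command line. An
# illustrative invocation (the dump path is hypothetical, not a file shipped
# with this repo):
#   snakemake --cores 4 --config WIKIDUMP=/path/to/wiktionary-pages-articles.xml.bz2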
rule all:
    input:
        defns_db = WORK + "/defns.db",
        agg_csv = STATS + "/stats.csv"
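
# Build the dumpsplit binary. The output path under target/release suggests a
# Rust/Cargo release build driven by dumpsplit/build.sh.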
rule make_dumpsplit:
    output:
        "dumpsplit/target/release/dumpsplit"
    shell:
        "cd dumpsplit && ./build.sh"
rule run_dumpsplit:
    input:
        dumpsplit = "dumpsplit/target/release/dumpsplit",
        wikidump = WIKIDUMP
    output:
        wikipages = directory(WIKIPAGES)
    shell:
        "mkdir -p {output.wikipages}" +
        " && lbunzip2 -c {input.wikidump} | {input.dumpsplit} {output.wikipages}"
rule clone_scribunto:
    output:
        directory("dumplabels/Scribunto")
    shell:
        "cd dumplabels && git clone https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto"
rule run_dumplabels:
    input:
        wikipages = WIKIPAGES,
        scribunto = "dumplabels/Scribunto"
    output:
        mod_data = directory(MOD_DATA)
    shell:
        "mkdir -p " + MOD_DATA +
        " && MOD_DUMP_PATH={input.wikipages}/modules luajit dumplabels/dump_labels.lua" +
        " > {output.mod_data}/labels.json" +
        " && python dumplabels/non_gram.py" +
        " {output.mod_data}/labels.json" +
        " {output.mod_data}/non_gram.json" +
        " {output.mod_data}/pos_categories.json"
rule run_makefsts:
    input:
        mod_data = MOD_DATA,
    output:
        fsts = directory(FSTS)
    shell:
        "mkdir -p " + FSTS +
        " && python parse.py make-fsts" +
        " --mod-data {input.mod_data}" +
        " {output.fsts}"
rule run_wikiparse:
    input:
        mod_data = MOD_DATA,
        wikipages = WIKIPAGES,
        fsts = FSTS
    output:
        parsed = directory(PARSED),
        stats_db = STATS + "/stats.db"
    log:
        LOG + "/wikiparse.log"
    shell:
        "mkdir -p {output.parsed}" +
        " && python parse.py parse-pages {input.wikipages}/fin" +
        " --outdir {output.parsed} --stats-db {output.stats_db}" +
        " --fsts-dir {input.fsts}" +
        " > {log} 2>&1"
rule insert_wikiparse:
    input:
        parsed = PARSED,
    output:
        defns_db = WORK + "/defns.db"
    shell:
        "export DATABASE_URL=sqlite:///{output.defns_db};" +
        " python parse.py create" +
        " && python parse.py insert-dir {input.parsed}"
rule proc_stats:
    input:
        stats_db = STATS + "/stats.db",
        defns_db = WORK + "/defns.db"
    output:
        agg_csv = STATS + "/stats.csv",
        cov = STATS + "/cov.txt",
        probs = STATS + "/probs.txt"
    shell:
        "export DATABASE_URL=sqlite:///{input.defns_db};" +
        " python parse.py parse-stats-agg {input.stats_db} {output.agg_csv}" +
        " && python parse.py parse-stats-cov --insert {output.agg_csv} > {output.cov}" +
        " && python parse.py parse-stats-probs {output.agg_csv} > {output.probs}"
onsuccess:
    shell("cp {log} " + LOG + "/snakemake.log")