forked from ufal/hamledt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpopulate_ud21.pl
executable file
·129 lines (123 loc) · 5.55 KB
/
populate_ud21.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env perl
# Creates folders and Makefiles for a new release of Universal Dependencies.
# Copyright © 2016, 2017 Dan Zeman <zeman@ufal.mff.cuni.cz>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# The udlib package is versioned in the UD tools repository.
use lib '/net/work/people/zeman/unidep/tools';
use udlib;
my $RELEASE = 2.1; # number to compare with the first release number in READMEs
my $CRELEASE = '21'; # compact string for HamleDT path, e.g. '14' for 'cs-ud14' (Czech in release 1.4)
# NOTE(review): $RELEASE is compared numerically, so a release such as 2.10 would be equal to 2.1.
# Confirm the comparison before reusing this script for later releases.
# Set to 1 for a dry run: folders are listed but nothing is created or executed.
my $dry_run = 0;
# We do not take the official release in /net/data because we test whether the UD_ folders are git repositories.
###!!! 2017-11-14 DZ: Somehow it is no longer clear to me why we test this. Is there any need to take the data from a git repository?
###!!! One possible explanation why I decided to take the data from /net/work/people/zeman/unidep instead of /net/data:
###!!! If I decide to improve some treebanks with the FixUD block, and if I do it in several rounds, perhaps with manual editing in between,
###!!! I will probably want the source path in the Makefile to point to /net/work/people/zeman/unidep, so that the second and later
###!!! 'make source' takes the data modified in the previous rounds and not the data that were part of the last release.
###!!! But if this is the only reason, it may not be worth it. At most I could generate the alternative path into another
###!!! variable in the Makefile so that it is at hand if needed.
my $udpath = '/net/work/people/zeman/unidep';
my @folders = udlib::list_ud_folders($udpath);
print("Found ", scalar(@folders), " UD folders in $udpath.\n");
# Names of the HamleDT folders created below; collected for the list of treebanks in the main Makefile.
my @hamledtfolders;
foreach my $folder (@folders)
{
    # The record is a hash reference; the fields used here are lcode, name, tcode and code.
    my $record = udlib::get_ud_files_and_codes($folder, $udpath);
    # Skip folders without data.
    next if(!defined($record->{lcode}));
    # The name of the folder: 'UD_' + language name + optional treebank identifier.
    # Example: UD_Ancient_Greek-PROIEL
    my $langcode = $record->{lcode};
    my $udname = $record->{name};
    # Read the README file first. We need to know whether this repository is scheduled for the upcoming release.
    # An undefined first release number compares as 0 and thus passes the test (same as before).
    # NOTE(review): confirm that folders without a release number in the README should be included.
    my $metadata = udlib::read_readme("$udpath/$folder");
    next unless($metadata->{firstrelease} <= $RELEASE);
    # Look for the other files in the repository. The directory handle is closed right after reading
    # so that it does not stay open while the external commands below are running.
    opendir(my $dh, "$udpath/$folder") or die("Cannot read the contents of the folder $udpath/$folder: $!");
    my @conllufiles = grep {-f "$udpath/$folder/$_" && m/\.conllu$/} (readdir($dh));
    closedir($dh);
    # Only process folders that are git repositories and contain CoNLL-U files.
    next unless(scalar(@conllufiles) > 0 && -d "$udpath/$folder/.git");
    my $lctreebank = $record->{tcode};
    my $key = $record->{code};
    my $hfolder = "$langcode-ud$CRELEASE$lctreebank";
    push(@hamledtfolders, $hfolder);
    print("$folder --> $hfolder\n");
    next if($dry_run);
    my $hpath = "/net/work/people/zeman/hamledt/normalize/$hfolder";
    # List form of system() bypasses the shell; a failure is reported here instead of later when writing the Makefile.
    system('mkdir', '-p', $hpath) == 0 or die("Cannot create the folder $hpath");
    # SOURCEDIR is derived from $udpath so that the generated Makefile always agrees with the folder scanned above.
    my $makefile = <<EOF;
LANGCODE=$langcode
TREEBANK=$hfolder
UDCODE=$key
UDNAME=$udname
include ../common.mak
SOURCEDIR=$udpath/UD_\$(UDNAME)
source:
EOF
    # Copy only those parts of the treebank (train, dev, test) that actually exist in the UD folder.
    foreach my $part (qw(train dev test))
    {
        if(-f "$udpath/$folder/$key-ud-$part.conllu")
        {
            $makefile .= "\tcp \$(SOURCEDIR)/\$(UDCODE)-ud-$part.conllu data/source/$part.conllu\n";
        }
    }
    $makefile .= <<EOF;
# Do not convert Universal Dependencies to the Prague style and then back to UD. Instead, read directly UD.
# Note that there will be just one tree per sentence, not three. (There are three trees per sentence for treebanks that are converted via Prague.)
ud: conllu_to_treex
EOF
    open(my $mfh, '>', "$hpath/Makefile") or die("Cannot write to Makefile: $!");
    print {$mfh} ($makefile);
    # Buffered write errors only show up when the handle is closed.
    close($mfh) or die("Cannot write to Makefile: $!");
    # The commands must not run in the current folder if 'cd' fails, hence '&&' after 'cd'.
    # The remaining commands are still run one after another regardless of their individual results.
    my $status = system("cd $hpath && { git add Makefile ; make dirs ; make source ; make ud ; }");
    if($status != 0)
    {
        print STDERR ("WARNING: Commands in $hpath did not finish successfully (system returned $status).\n");
    }
}
# The Makefile in $HAMLEDT/normalize needs a list of all treebanks in the current UD release.
# Generate the list so that it does not have to be typed manually.
# The Makefile is read line by line: a previous TREEBANKS_UD<release> line is dropped, the line
# 'TREEBANKS = ...' is replaced by the new list followed by 'TREEBANKS = $(TREEBANKS_UD<release>)',
# and all other lines are kept unchanged. Then the file is rewritten in place.
my $makefile_path = '/net/work/people/zeman/hamledt/normalize/Makefile';
# Initialized to an empty string so that an empty input file does not lead to printing an undefined value.
my $makefile_contents = '';
# Remembers whether the new list found its place in the Makefile.
my $list_written = 0;
open(my $in, '<', $makefile_path) or die("Cannot read $makefile_path: $!");
while(my $line = <$in>)
{
    # \Q...\E makes sure that the release string is matched literally.
    if($line =~ m/^TREEBANKS_UD\Q$CRELEASE\E\s*=/)
    {
        # Discard the previous list, if any.
        next;
    }
    if($line =~ m/^TREEBANKS\s*=/)
    {
        my $list = "TREEBANKS_UD$CRELEASE = ".join(' ', @hamledtfolders)."\n";
        print STDERR ($list);
        $makefile_contents .= $list;
        $makefile_contents .= "TREEBANKS = \$(TREEBANKS_UD$CRELEASE)\n";
        $list_written = 1;
        next;
    }
    $makefile_contents .= $line;
}
close($in);
# The list is only inserted where a 'TREEBANKS =' line exists. Without such a line it would be lost silently.
if(!$list_written)
{
    print STDERR ("WARNING: No line 'TREEBANKS = ...' found in $makefile_path, the list of treebanks was not written.\n");
}
open(my $out, '>', $makefile_path) or die("Cannot write $makefile_path: $!");
print {$out} ($makefile_contents);
# The file is rewritten in place, so a failed write must not go unnoticed.
close($out) or die("Cannot write $makefile_path: $!");