-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconstructFunctions.R
101 lines (99 loc) · 4.72 KB
/
constructFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Split a heading line into its code digits and its description text.
#
# lineText: character line. After the marker clean-up below, the code is read
#   from fixed character positions, so the line is expected to look like
#   "<tab>NN.NN<tab>description".
# Returns an unnamed list of two elements:
#   [[1]] characters 2-3 and 5-6 of the cleaned line pasted together
#         (the "NN.NN" code without its dot),
#   [[2]] the description with the code and leading tabs removed.
startBuildHeading <- function(lineText) {
  # A leading one-letter marker (u, T or S) in front of the first tab is
  # reduced to just the tab, so the fixed positions used below line up.
  for (markerPattern in c("^u\t", "^T\t", "^S\t")) {
    lineText <- sub(markerPattern, "\t", lineText)
  }
  # Drop the space from the first "space + tab" pair in the line.
  lineText <- sub(" \t", "\t", lineText)

  headingCode <- paste0(substr(lineText, 2, 3), substr(lineText, 5, 6))

  # Remove, in this order: leading tabs plus the "NN.NN" code, any remaining
  # leading tabs, and leading "whitespace + tab" pairs.
  for (leadingPattern in c("^\t+[0-9]{2}\\.[0-9]{2}", "^\t+", "^(\\s\t)+")) {
    lineText <- sub(leadingPattern, "", lineText)
  }

  # The first "full stop + tab" is turned into "full stop + space".
  description <- sub("\\.\t", ". ", lineText)

  list(headingCode, description)
}
# Split an eight-digit tariff line into its code digits and description text.
#
# lineText: character line. After the clean-up below, the code is read from
#   fixed character positions, so the line is expected to look like
#   "<tab>NNNN.NN.NN<tab>...".
# Returns an unnamed list of two elements:
#   [[1]] characters 2-5, 7-8 and 10-11 of the cleaned line pasted together
#         (the "NNNN.NN.NN" code without its dots),
#   [[2]] the text left after the leading columns and the trailing
#         "5 / Free", "10 / Free" or "Free / Free" columns are stripped.
#
# Every step uses sub(), so only the first match of each pattern is removed,
# and the steps are order-dependent.
startBuildHS8 <- function(lineText) {
  # A leading one-letter marker (u, T or S) in front of the first tab is
  # reduced to just the tab.
  for (markerPattern in c("^u\t", "^T\t", "^S\t")) {
    lineText <- sub(markerPattern, "\t", lineText)
  }
  lineText <- sub(" \t", "\t", lineText) # Tidy leading spaces
  # The fixed positions below assume a leading tab, so add one when the line
  # starts directly with a digit.
  lineText <- sub("^(?=\\d)", "\t", lineText, perl = TRUE)

  tariffNumber <- paste0(
    substr(lineText, 2, 5),
    substr(lineText, 7, 8),
    substr(lineText, 10, 11)
  )

  # Leading columns to delete, applied one after another in this order.
  leadingPatterns <- c(
    "^\t[0-9]{4}\\.[0-9]{2}\\.[0-9]{2}\t", # number off front of string
    "[0-9]{2}[A-Z]\t",                      # stats code off front of string
    # Alphabetic unit column. This used to be [A-z], which in ASCII also
    # matches [ \ ] ^ _ and backtick; restricted to letters only.
    "(^[A-Za-z]+\t)|(^ [A-Za-z]+\t)",
    "(^\\sNo\\.\t)|(^No\\.\t)",
    "(^m2\t)|(^ m2\t)",
    # NOTE(review): "m?" here and again below looks like a mis-encoded
    # superscript (m² / m³). As written it matches an optional "m" followed by
    # a tab. Left unchanged - confirm the intended characters.
    "(^m?\t)|(^ m?\t)",
    "(^m²\t)|(^ m²\t)|(^\tm²\t)",
    "(^m3\t)|(^ m3\t)",
    "(^m?\t)|(^ m?\t)",
    "(^\\.\\.\t)|(^ \\.\\.\t)|(^\t\\.\\.\t)",
    "^([a-z] [a-z]{2})+\t",
    "^\\.+\t",
    "^\t+",
    "^( \t)+"
  )
  for (leadingPattern in leadingPatterns) {
    lineText <- sub(leadingPattern, "", lineText)
  }

  # The first "full stop + tab" loses its full stop (marked "TRY THIS" by the
  # original author), then trailing tabs collapse to a single tab.
  lineText <- sub("\\.\t", "\t", lineText)
  lineText <- sub("\t+$", "\t", lineText)

  # Trailing whitespace first, then the trailing rate columns.
  for (trailingPattern in c("\\s+$", "\t5\tFree$", "\t10\tFree$",
                            "\tFree\tFree$")) {
    lineText <- sub(trailingPattern, "", lineText)
  }

  # Will need to manually remove the double hyphens I think e.g. 1604.13.01
  # Salmon and brisling. Excel deals with them gracefully.
  list(tariffNumber, lineText)
}
# Split a subheading line into its six-digit code (if any) and its text.
#
# lineText: character vector of lines; each element is handled independently.
# Returns an unnamed list of two elements:
#   [[1]] for lines that start with a tab followed by a digit (after marker
#         clean-up), characters 2-5 and 7-8 pasted together (the "NNNN.NN"
#         code without its dot); otherwise the string "NA" (a character
#         sentinel, not NA_character_),
#   [[2]] the text with leading columns removed and whitespace trimmed.
#
# The code test used to be a scalar if (), which fails for input of length
# other than one in current R; a logical mask is used so that vectors
# (including empty ones) work like they do in the sibling functions.
startSubhead <- function(lineText) {
  # A leading one-letter marker (u, T or S) in front of the first tab is
  # reduced to just the tab.
  for (markerPattern in c("^u\t", "^T\t", "^S\t")) {
    lineText <- sub(markerPattern, "\t", lineText)
  }

  hasCode <- grepl("^\t[0-9]", lineText)
  tariffNumber <- rep("NA", length(lineText))
  tariffNumber[hasCode] <- paste0(
    substr(lineText[hasCode], 2, 5),
    substr(lineText[hasCode], 7, 8)
  )

  # Each step removes only the first match; the order matters.
  lineText <- sub("^\t[0-9]{4}\\.[0-9]{2}\t", "", lineText) # leading code
  lineText <- sub("[0-9]{2}[A-Z]\t", "", lineText) # two digits + capital
  lineText <- sub("^[a-z]+\t", "", lineText) # lower-case leading column
  lineText <- sub("^([a-z] [a-z]{2})+\t", "", lineText)
  lineText <- sub("(^\\sNo\\.\t)|(^No\\.\t)", "", lineText)
  # Disabled in the original: lineText <- sub("\\s\t","\t",lineText)
  lineText <- sub("^\t+", "", lineText)
  # First remaining run of tabs anywhere in the line is deleted outright.
  lineText <- sub("\t+", "", lineText)
  lineText <- sub("\\.\t", ". ", lineText)
  # Trim leading, then trailing, whitespace.
  lineText <- sub("^\\s+", "", lineText)
  lineText <- sub("\\s+$", "", lineText)

  list(tariffNumber, lineText)
}
# Pull the stats key out of a line and return what is left of the text.
#
# lineText: character line.
# dots_level: not used by the live code. An earlier, now disabled, version
#   used it (values 1-8) to strip that many leading "full stop + whitespace"
#   groups, i.e. sub("(\\.\\s+){dots_level}", "", lineText).
# Returns an unnamed list of two elements:
#   [[1]] the first "two digits + capital letter" match found in the line, as
#         returned by str_extract(),
#   [[2]] the line with the tariff code, stats key, unit column and leading
#         tabs removed.
#
# Relies on str_extract() and str_replace() being in scope (stringr).
startStats <- function(lineText, dots_level) {
  statsKey <- str_extract(lineText, "[0-9]{2}[A-Z]")

  # Strip off tariff code, if this line has one
  remainder <- str_replace(
    lineText,
    "^.*\t+.*[0-9][0-9][0-9][0-9]\\.[0-9][0-9]\\.[0-9][0-9]\t\\s*",
    ""
  )

  # Removed one after another, first match only, in exactly this order.
  stripPatterns <- c(
    "\t*[0-9]{2}[A-Z]",           # the stats key itself
    "^\t*\\s*No\\.",              # unit columns follow
    "^\t*kg\\s*\t+\\s*",
    "^\t*pr\\s*\t+\\s*",
    "^\t*m3\\s*\t+\\s*",
    "^\t*m2\\s*\t+\\s*",
    "^\t*M2\\s*\t+\\s*",
    "^\t*tne\\s*\t+\\s*",
    "^\t*\\.\\.\\s*\t+\\s*",
    "^\t*"                        # whatever leading tabs remain
  )
  for (stripPattern in stripPatterns) {
    remainder <- sub(stripPattern, "", remainder)
  }

  list(statsKey, remainder)
}