-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpardocs.go
96 lines (80 loc) · 2.76 KB
/
pardocs.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package pardocs // import "github.com/Sinar/go-pardocs"
import (
"log"
"github.com/Sinar/go-pardocs/internal/hansard"
)
// ParliamentDocs carries the Configuration used by its Plan and Split methods.
type ParliamentDocs struct {
// Conf holds the settings read by Plan and Split.
Conf Configuration
}
// CommandMode identifies which operation is being executed.
type CommandMode int

// The set of available commands, numbered consecutively from zero.
const (
	// PLAN is the first command and the zero value of CommandMode.
	PLAN CommandMode = iota
	// SPLIT is the second command.
	SPLIT
	// RESET is the third command.
	RESET
)
// Configuration holds the settings a ParliamentDocs value runs with.
type Configuration struct {
// ParliamentSession is the parliament session label. Plan copies it onto
// the Hansard document and Split passes it to ExecuteSplit.
ParliamentSession string
// HansardType is the Hansard type; it is passed to the hansard package
// when saving and loading the split plan.
HansardType hansard.HansardType
// WorkingDir is the base directory; the ./raw and ./data folders are
// assumed to be relative to it.
WorkingDir string
// SourcePDFPath is the path of the source PDF, which can be anywhere.
// TODO: maybe make it a Reader so it can be read directly from S3?
SourcePDFPath string
// Cmd is the command being executed.
Cmd CommandMode
}
// Plan analyzes the source PDF named by pd.Conf.SourcePDFPath and persists
// the resulting split plan.
//
// It loads the PDF pages, feeds each page's line excerpt to a Hansard
// document so questions can be detected, finalizes the document, stamps it
// with the configured parliament session and Hansard type, and hands it to
// hansard.SavePlan. Any failure along the way terminates the process via
// log.Fatal.
func (pd *ParliamentDocs) Plan() {
	log.Println("In Plan ..")
	pdfPath := pd.Conf.SourcePDFPath

	// Extract out hansard.MaxLineProcessed lines from each page to be analyzed
	pdfDoc, err := hansard.NewPDFDoc(pdfPath)
	if err != nil {
		log.Fatal(err)
	}
	// Sanity check before proceeding ..
	if len(pdfDoc.Pages) < 1 {
		log.Fatal("Could NOT find any pages!")
	}

	// Analyze the Hansard Document to find the question split.
	// The construction error used to be discarded; fail fast like the other
	// error paths instead of continuing with an unusable document.
	hansardDoc, err := hansard.NewHansardDocument(pdfPath)
	if err != nil {
		log.Fatal(err)
	}
	for _, p := range pdfDoc.Pages {
		//log.Println("PAGE:", p.PageNo)
		// Detect question
		if dterr := hansardDoc.ProcessLinesExcerpt(p.PageNo, p.PDFTxtSameLines); dterr != nil {
			log.Fatal(dterr)
		}
	}
	// Wrap up processing; the page-count check above guarantees that at
	// least one page went through the loop.
	hansardDoc.Finalize()

	// TODO: Better refactoring somewhere else? looks like a bit of a hack ..
	hansardDoc.ParliamentSession = pd.Conf.ParliamentSession // Mis-naming? is this the right place to place this?
	hansardDoc.HansardType = pd.Conf.HansardType

	// Persist the plan
	// NOTE(review): if SavePlan returns an error it is not inspected here — confirm.
	hansard.SavePlan(pd.Conf.HansardType, pd.Conf.WorkingDir, pd.Conf.SourcePDFPath, hansardDoc)
	//sessionName, hansardType := getParliamentDocMetadata(pdfPath, pd.Conf.HansardType)
	//hansardDoc.PersistForSplit(fmt.Sprintf("%s/data/%s/%s", pd.Conf.WorkingDir, hansardType, sessionName))
}
// Split loads the previously saved split plan for the configured source PDF
// and executes the split for every Hansard question listed in it.
func (pd *ParliamentDocs) Split() {
	log.Println("In Split ..")

	cfg := &pd.Conf

	// Load the stored plan first, then build the splitter from the same
	// (Hansard type, working dir, source PDF) triple.
	//sessionName, hansardType := getParliamentDocMetadata(pd.Conf.SourcePDFPath, pd.Conf.HansardType)
	//planLocation := fmt.Sprintf("%s/data/%s/%s/split.yml", pd.Conf.WorkingDir, hansardType, sessionName)
	storedPlan := hansard.LoadSplitHansardDocPlanFromFile(cfg.HansardType, cfg.WorkingDir, cfg.SourcePDFPath)
	splitter := hansard.NewSplitHansardDocumentPlan(cfg.HansardType, cfg.WorkingDir, cfg.SourcePDFPath)

	// Run the split once per question in the plan.
	for _, question := range storedPlan.HansardQuestions {
		splitter.ExecuteSplit(cfg.ParliamentSession, question)
	}
}
// Reset is currently a stub: it only logs that it was entered.
//
// TODO: implement the cleanup steps:
//   - clean up the plan
//   - clean up the split pages folder
//   - clean up the merged pages location
func (pd *ParliamentDocs) Reset() {
	log.Println("In Reset ...")
}