-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtestDocx.html
74 lines (61 loc) · 2.94 KB
/
testDocx.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
<!-- open -a 'Google Chrome' --args -allow-file-access-from-files -->
<body>
<button onclick="gettext()">Get document text</button>
</body>
<script src="https://unpkg.com/pizzip@3.1.1/dist/pizzip.js"></script>
<script src="https://unpkg.com/pizzip@3.1.1/dist/pizzip-utils.js"></script>
<script>
// Will use https://developer.mozilla.org/en-US/docs/Web/API/FileReader/readAsDataURL
/**
 * Fetches the file at `url` and hands the outcome to `callback`.
 *
 * This is a thin wrapper that delegates to PizZipUtils.getBinaryContent
 * (from the pizzip-utils script included by this page).
 *
 * @param {string} url - Location of the file to load.
 * @param {function} callback - Invoked by PizZipUtils as (error, content).
 */
function loadFile(url, callback) {
  // The helper's return value is intentionally not propagated.
  PizZipUtils.getBinaryContent(url, callback);
}
// The first 3 paragraphs are "Transcript", "13 March 2024" and
// "XXX started transcription", so we ignore all short paragraphs.
/**
 * Builds one NVivo transcript line from a <w:p> paragraph element.
 *
 * The paragraph's text (t) and line-break (br) elements are collected in
 * document order. The expected layout is:
 *   [0] br, [1] t (speaker), [2] t (timestamp), [3] br, [4...] content.
 *
 * @param {Element} para - The <w:p> element to convert.
 * @returns {string} "Timestamp\tSpeaker\tContent\n", or "" when the paragraph
 *   has fewer than five text/break elements and so is not a transcript line.
 */
function transcriptLineFromDocxPara(para) {
  const MIN_NODE_COUNT = 5;
  const nodes = Array.from(para.querySelectorAll("t, br"));

  // Headings and other short paragraphs are not transcript lines.
  if (nodes.length < MIN_NODE_COUNT) {
    return "";
  }

  const [, speakerNode, timestampNode, , ...contentNodes] = nodes;

  // Each line break becomes "---"; each text run is followed by one space.
  let content = "";
  for (const node of contentNodes) {
    content += node.tagName.includes("br") ? "---" : node.textContent + " ";
  }

  // trimEnd() drops the space left behind by the final text run.
  return `${timestampNode.textContent}\t${speakerNode.textContent}\t${content.trimEnd()}\n`;
}
/**
 * Builds the NVivo transcript from the given Docx content.
 *
 * A Docx file is a zipped archive containing several files; the one that
 * matters is 'word/document.xml', an XML file made of paragraphs <w:p>, each
 * containing text <w:t> and line-end <w:br> elements.
 *
 * Earlier code stripped an initial byte order mark (BOM, char code 65279)
 * from the XML text, but these files don't need that.
 *
 * @param {string} content - The Docx file content as a binary string.
 * @returns {string} A tab-separated transcript: a header row followed by one
 *   line per transcript paragraph.
 * @throws {Error} If the archive has no 'word/document.xml' entry, or if that
 *   entry cannot be parsed as XML.
 */
function transcriptFromDocx(content) {
  const DOCUMENT_ENTRY = "word/document.xml";

  const zip = new PizZip(content);
  const documentEntry = zip.files[DOCUMENT_ENTRY];
  if (!documentEntry) {
    // Fail with a clear message rather than a TypeError on `.asText()`.
    throw new Error(`Not a valid .docx archive: missing "${DOCUMENT_ENTRY}" entry`);
  }

  const xml = new DOMParser().parseFromString(documentEntry.asText(), "text/xml");

  // DOMParser does not throw on malformed XML; it returns a document that
  // contains a <parsererror> element instead.
  const parseError = xml.querySelector("parsererror");
  if (parseError) {
    throw new Error(`Could not parse ${DOCUMENT_ENTRY} as XML: ${parseError.textContent}`);
  }

  const lines = Array.from(xml.getElementsByTagName("w:p"))
    .map((para) => transcriptLineFromDocxPara(para));

  return "Timespan\tSpeaker\tContent\n" + lines.join("");
}
/**
 * Click handler for the page's button: loads "Example.docx", converts it to
 * an NVivo transcript, then logs the transcript to the console and shows it
 * in an alert.
 *
 * If loading fails, the error is shown in an alert and no conversion is
 * attempted.
 */
function gettext() {
  loadFile("Example.docx", (error, content) => {
    if (error) {
      alert(error);
      // Stop here: there is no usable content to convert.
      return;
    }
    const text = transcriptFromDocx(content);
    console.log(text);
    alert(text);
  });
}
</script>