testDocx.html

<!-- open -a 'Google Chrome' --args -allow-file-access-from-files -->
<body>
    <button onclick="gettext()">Get document text</button>
</body>
<script src="https://unpkg.com/pizzip@3.1.1/dist/pizzip.js"></script>
<script src="https://unpkg.com/pizzip@3.1.1/dist/pizzip-utils.js"></script>
<script>
    
    // Will use https://developer.mozilla.org/en-US/docs/Web/API/FileReader/readAsDataURL
    /**
     * Loads the file at `url` by delegating to PizZipUtils.getBinaryContent
     * (presumably provided by the pizzip-utils.js script included by this page).
     *
     * @param url       Location of the file to load; gettext() passes the
     *                  relative path "Example.docx".
     * @param callback  Handed to PizZipUtils.getBinaryContent unchanged;
     *                  gettext() supplies a function taking (error, content).
     */
    function loadFile(url, callback) {
        PizZipUtils.getBinaryContent(url, callback);
    }
    // The first 3 paragraphs are "Transcript", "13 March 2024" and
    // "XXX started transcription", so we ignore all short paragraphs.
    
    function transcriptLineFromDocxPara(para) {
        // Builds one NVivo transcript row, "Timestamp\tSpeaker\tContent\n", from
        // the given <w:p> document element. Returns the empty string when the
        // paragraph holds fewer than five text/break nodes, i.e. it is too short
        // to be a transcript line.

        // Expected node order: w:br, w:t (Speaker), w:t (Timestamp), w:br,
        // followed by the content as a mix of w:t and w:br elements.
        const nodes = Array.from(para.querySelectorAll("t, br"));
        if (nodes.length < 5) {
            return "";
        }

        const timestamp = nodes[2].textContent;
        const speaker = nodes[1].textContent;

        // Every text run is followed by a space; every line break becomes "---".
        let spoken = "";
        for (const node of nodes.slice(4)) {
            spoken += node.tagName.includes("br") ? "---" : node.textContent + " ";
        }

        // trimEnd removes the space left behind by the final text run.
        return `${timestamp}\t${speaker}\t${spoken.trimEnd()}\n`;
    }
    
    function transcriptFromDocx(content) {
        // Answers the NVivo transcript from the given Docx content as a binary string.
        // The result is a tab-separated table: a "Timespan\tSpeaker\tContent" header
        // followed by one row per paragraph that transcriptLineFromDocxPara accepts.
        //
        // Throws an Error when the archive has no 'word/document.xml' entry, i.e.
        // the content is a zip but not a Docx file.

        // A Docx file is a zipped archive containing several files, but the one that matters
        // is 'word/document.xml', an xml file which contains paragraphs <w:p>,
        // each containing text <w:t> and line end <w:br> elements.

        // There was code to strip out an initial byte order mark (BOM), a marker at the
        // beginning of a file that indicates the byte order of its text data
        // (.charCodeAt(0) === 65279), but these files don't need that.

        const documentEntry = "word/document.xml";
        const zip = new PizZip(content);
        const documentFile = zip.files[documentEntry];
        if (!documentFile) {
            // Fail with a clear message instead of a TypeError on `.asText`.
            throw new Error(
                "Not a Docx file: the archive has no '" + documentEntry + "' entry.");
        }

        const xml = new DOMParser().parseFromString(
              documentFile.asText(),
              "text/xml");
        return "Timespan\tSpeaker\tContent\n"
            + Array.from(xml.getElementsByTagName("w:p"))
                .map(transcriptLineFromDocxPara)
                .join("");
    }
    
    function gettext() {
        // Click handler for the "Get document text" button: loads "Example.docx",
        // converts it to an NVivo transcript, then logs the transcript to the
        // console and shows it in an alert.
        loadFile(
            "Example.docx",
            function (error, content) {
                if (error) {
                    alert(error);
                    // Stop here: there is no usable content to convert, and
                    // carrying on would only raise a second, misleading error.
                    return;
                }

                const text = transcriptFromDocx(content);
                console.log(text);
                alert(text);
            }
        );
    }
</script>