Skip to content

Commit

Permalink
NEW: support KWIC format (#19)
Browse files Browse the repository at this point in the history
closes #19
  • Loading branch information
dwhieb authored Sep 28, 2019
1 parent 2de0a17 commit b9ad470
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 56 deletions.
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,38 @@ text | utterance | word | pre | token | post

**NOTE:** _This project is still in initial development phases, but should be ready for initial release by the end of September 2019._

## Basic Usage

The following examples process any JSON files in the current directory and output a concordance file to `concordance.tsv` in Keyword in Context format. At a minimum, the concordance function requires a single argument: a wordform or list of wordforms to concordance.

As a module:

```js
const concordance = require(`concordance`)

const wordforms = [`little`, `big`];

concordance(wordforms);
```

On the command line:

```cmd
dlx-conc -k little,big
```

**Note:** Keyword in Context format is _not_ enabled by default. On the command line, enable it by passing the `-k` or `--KWIC` flag (the long flag is case-sensitive); when using this library as a module, pass `{ KWIC: true }` in the options.

## Options

The available options are listed below.

Module | Command Line | Default | Description
------------ | ------------------ | ------------------- | -----------
`dir` | `-d, --dir` | `"."` | the directory where the corpus is located
`KWIC` | `-k, --KWIC` | `false` | whether to create the concordance in Keyword in Context format; adds `pre` and `post` columns to the concordance if true
`outputPath` | `-o, --outputPath` | `"concordance.tsv"` | location where the concordance file should be generated

## Contributing

[Report an issue or suggest a feature here.][issues]
Expand Down
11 changes: 6 additions & 5 deletions concordance.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ const { version } = require(`./package.json`);

program.version(version, `-v, --version`, `output the current version`)
.arguments(`<wordforms>`, formatList)
.option(`-d, --dir <dir>`, `directory where the corpus is located`)
.option(`-o, --outputPath <outputPath>`, `location where the concordance file should be generated`);
.option(`-d, --dir <dir>`, `directory where the corpus is located`, `.`)
.option(`-k, --KWIC`, `whether to output the concordance in Keyword in Context (KWIC) format; if true, "pre" and "post" fields are added`, false)
.option(`-o, --outputPath <outputPath>`, `location where the concordance file should be generated`, `concordance.tsv`);

program.parse(process.argv);

Expand All @@ -21,7 +22,7 @@ function formatList(str) {
return str.split(/\s*,\s*/u);
}

const wordforms = formatList(wordformsArg);
const { dir, outputPath } = program.opts();
const wordforms = formatList(wordformsArg);
const options = program.opts();

concordance(wordforms, dir, outputPath);
concordance(wordforms, options);
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"dlx-conc": "./concordance.js"
},
"scripts": {
"test": "node concordance.js and,the -d test -o test/concordance.tsv"
"test": "node concordance.js -d test -o test/concordance.tsv -k little"
},
"engines": {
"node": ">=12.0"
Expand Down
85 changes: 67 additions & 18 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ const defaultColumns = [
`token`,
];

const kwicColumns = [...defaultColumns.slice(0, 3), `pre`, `token`, `post`];

// METHODS

/**
Expand All @@ -43,6 +45,30 @@ function chunk(arr, size) {

}

/**
 * Builds a single space-separated string from the transcriptions of a list of DLx Word Tokens.
 * Each token's `transcription` property is resolved to a string with `getTranscription` before joining.
 * @param  {Array} words An array of DLx Word Tokens
 * @return {String}      The resolved transcriptions, joined with a single space
 */
function concatWords(words) {
  const transcriptions = words.map(word => getTranscription(word.transcription));
  return transcriptions.join(` `);
}

/**
 * Retrieves the value of a transcription.
 *
 * A string is returned unchanged. For an object, the `eng` value is preferred,
 * then `en`, then the value stored under the object's first key.
 * A missing transcription (`null` / `undefined`) or an object with no usable value
 * yields an empty string instead of throwing or returning `undefined`.
 * @param  {Object|String} txn The value of the DLx transcription property
 * @return {String}
 */
function getTranscription(txn) {

  if (typeof txn === `string`) return txn;

  // `typeof null` is `object`, so nullish values need an explicit guard
  // before any property access.
  if (txn === null || typeof txn === `undefined`) return ``;

  const [firstKey] = Object.keys(txn);

  // Falsy values (e.g. an empty `eng` string) fall through to the next candidate.
  return txn.eng || txn.en || txn[firstKey] || ``;

}

/**
* Ignore method passed to recursive-readdir. Ignores all non-JSON files.
* @param {String} filePath The file path to check
Expand All @@ -60,9 +86,11 @@ function ignore(filePath, stats) {
* @param {Array} wordforms An array of wordforms to concordance
* @param {Stream} csvStream The CSV stream
* @param {Object} progressBar The progress bar to increment
* @param {Object} options The options passed to the concordance method
* @return {Promise}
*/
function processFile(filePath, wordforms, csvStream, progressBar) {
// eslint-disable-next-line max-params
function processFile(filePath, wordforms, csvStream, progressBar, options) {
return new Promise((resolve, reject) => {

const title = path.basename(filePath, `.json`);
Expand All @@ -76,22 +104,27 @@ function processFile(filePath, wordforms, csvStream, progressBar) {

words.forEach(({ transcription }, i) => {

const txn = typeof transcription === `string` ?
transcription :
transcription.eng
|| transcription.en
|| transcription[Object.keys(transcription)[0]];
const txn = getTranscription(transcription);

if (!wordforms.includes(txn)) return;

const wordNum = i + 1;
const record = [title, utteranceNum, wordNum];

if (options.KWIC) {

csvStream.write([
title,
utteranceNum,
wordNum,
txn,
]);
const pre = concatWords(words.slice(0, i));
const post = concatWords(words.slice(i + 1));

record.push(...[pre, txn, post]);

} else {

record.push(txn);

}

csvStream.write(record);

});

Expand All @@ -117,27 +150,42 @@ function processFile(filePath, wordforms, csvStream, progressBar) {
* @param {Array} wordforms Array of wordforms to list the tokens of
* @param {Stream} csvStream The CSV stream
* @param {Object} progressBar The progress bar to increment
* @param {Object} options The options passed to the concordance method
* @return {Promise}
*/
function processFiles(files, wordforms, csvStream, progressBar) {
return Promise.all(files.map(file => processFile(file, wordforms, csvStream, progressBar)));
function processFiles(files, wordforms, csvStream, progressBar, options) { // eslint-disable-line max-params
return Promise.all(files.map(file => processFile(file, wordforms, csvStream, progressBar, options)));
}

// MAIN

/**
* Create a concordance file for a set of wordforms in a JSON corpus
* @param {String|Array} wordforms A wordform or array of wordforms to concordance
* @param {Object} [options={}] An options hash
* @param {String} [options.dir=`.`] The directory where the JSON corpus is located
* @param {String} [options.outputPath=`concordance.tsv`] The path where the concordance file should be generated
* @param {Boolean} [options.KWIC=false] Whether to output the concordance in Keyword in Context (KWIC) format, with "pre" and "post" columns
* @return {Promise}
*/
async function concordance(
wordforms,
dir = `.`,
outputPath = `concordance.tsv`
options = {},
) {

wordforms = Array.from(wordforms); // eslint-disable-line no-param-reassign

const {
dir = `.`,
KWIC = false,
outputPath = `concordance.tsv`,
} = options;

const files = await recurse(dir, [ignore]);
const fileGroups = chunk(files, 10);

const csvOptions = {
columns: defaultColumns,
columns: KWIC ? kwicColumns : defaultColumns,
delimiter: `\t`,
header: true,
quote: false,
Expand All @@ -156,7 +204,8 @@ async function concordance(
fileGroup,
wordforms,
csvStream,
progressBar
progressBar,
{ KWIC }
);
}

Expand Down
38 changes: 6 additions & 32 deletions test/concordance.tsv
Original file line number Diff line number Diff line change
@@ -1,32 +1,6 @@
text utterance word token
harry-potter 1 2 and
harry-potter 2 3 the
harry-potter 3 4 the
harry-potter 5 5 and
harry-potter 5 7 and
harry-potter 5 11 the
harry-potter 5 35 the
harry-potter 6 1 the
harry-potter 6 9 and
harry-potter 7 1 the
harry-potter 7 13 and
harry-potter 8 13 the
harry-potter 9 27 and
harry-potter 10 1 the
harry-potter 10 7 the
harry-potter 10 12 the
harry-potter 10 16 the
three-little-pigs 1 16 and
three-little-pigs 2 12 the
three-little-pigs 3 1 the
three-little-pigs 4 8 and
three-little-pigs 5 1 the
three-little-pigs 5 16 and
three-little-pigs 6 4 and
three-little-pigs 6 6 and
three-little-pigs 6 9 the
three-little-pigs 6 12 the
three-little-pigs 7 1 the
three-little-pigs 7 9 and
three-little-pigs 8 11 and
three-little-pigs 9 7 the
text utterance word pre token post
three-little-pigs 1 14 once upon a time there was an old mother pig who had three little pigs and not enough food to feed them
three-little-pigs 3 3 the first little pig was very lazy
three-little-pigs 5 3 the second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks
three-little-pigs 5 7 the second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks
three-little-pigs 7 3 the third little pig worked hard all day and built his house with bricks

0 comments on commit b9ad470

Please sign in to comment.