Refactor XLIFF Import AA and add unit tests
    This pulls out most of the target injection/alignment code into
    a separate class for easier testing.
tingley committed Nov 28, 2016
1 parent 32d250a commit 27b52a8
Showing 8 changed files with 520 additions and 142 deletions.
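The extracted class makes the alignment logic testable without a live WorldServer instance: XLIFFTargetContentAligner only needs an XLIFF input stream, an encoding, a source locale, and an iterator of WSTextSegmentTranslation objects. The sketch below is a rough illustration of that kind of isolated test, not one of the tests added by this commit; it assumes Mockito is on the test classpath, uses a hypothetical test class name and sample XLIFF document, and passes null for the WSTranslationType so it does not depend on any particular translation-type constant.

package com.spartansoftwareinc.ws.autoactions.xliff;

import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collections;

import org.junit.Test;

import com.idiominc.wssdk.asset.WSTextSegmentTranslation;

import net.sf.okapi.common.LocaleId;

public class XLIFFTargetContentAlignerSketchTest {

    // Minimal XLIFF 1.2 document with one trans-unit that carries a target.
    private static final String XLIFF =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
        "<xliff version=\"1.2\" xmlns=\"urn:oasis:names:tc:xliff:document:1.2\">\n" +
        " <file original=\"hello.txt\" source-language=\"en\" target-language=\"fr\" datatype=\"plaintext\">\n" +
        "  <body>\n" +
        "   <trans-unit id=\"1\">\n" +
        "    <source>Hello world</source>\n" +
        "    <target>Bonjour le monde</target>\n" +
        "   </trans-unit>\n" +
        "  </body>\n" +
        " </file>\n" +
        "</xliff>\n";

    @Test
    public void alignsSingleSegment() throws Exception {
        // A mocked WorldServer segment; Mockito's default of null for
        // getTarget() means the aligner should overwrite it with the XLIFF target.
        WSTextSegmentTranslation wsSeg = mock(WSTextSegmentTranslation.class);

        // Passing null for the injected WSTranslationType keeps the sketch free
        // of assumptions about specific translation-type constants.
        XLIFFTargetContentAligner aligner = new XLIFFTargetContentAligner(null);
        InputStream is = new ByteArrayInputStream(XLIFF.getBytes(StandardCharsets.UTF_8));
        int count = aligner.alignTargetContent(is, "UTF-8", LocaleId.ENGLISH,
                Collections.singletonList(wsSeg).iterator());

        assertEquals(1, count);
        verify(wsSeg).setTarget("Bonjour le monde");
    }
}

With no existing target on the mocked segment, the aligner should report one injected segment and copy the XLIFF target text onto it.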
7 changes: 7 additions & 0 deletions autoactions/xliff/pom.xml
@@ -19,6 +19,13 @@
<artifactId>okapi-ws-filters-xliff</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.spartansoftwareinc.ws.okapi.filters</groupId>
<artifactId>okapi-ws-filters-base</artifactId>
<version>${project.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
</dependencies>

<build>
@@ -1,27 +1,14 @@
package com.spartansoftwareinc.ws.autoactions.xliff;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.filters.xliff.XLIFFFilter;

import org.apache.log4j.Logger;

import com.idiominc.wssdk.WSContext;
import com.idiominc.wssdk.WSException;
import com.idiominc.wssdk.WSRuntimeException;
import com.idiominc.wssdk.ais.WSNode;
import com.idiominc.wssdk.asset.WSAssetTask;
import com.idiominc.wssdk.asset.WSAssetTranslation;
@@ -31,7 +18,6 @@
import com.idiominc.wssdk.component.WSParameterFactory;
import com.idiominc.wssdk.component.autoaction.WSActionResult;
import com.idiominc.wssdk.component.autoaction.WSTaskAutomaticAction;
import com.idiominc.wssdk.component.filter.WSFilter;
import com.idiominc.wssdk.workflow.WSTask;
import com.spartansoftwareinc.ws.okapi.Version;
import com.spartansoftwareinc.ws.okapi.filters.utils.FilterUtil;
@@ -50,7 +36,6 @@ public class ImportXLIFFTargetsAutomaticAction extends WSTaskAutomaticAction {

private AssetType assetType;
private WSTranslationType injectedTranslationType;
private int nextPlaceholderId = 1;

@Override
public String getDescription() {
@@ -93,135 +78,21 @@ protected WSNode getAsset(WSAssetTask assetTask) {
}
}

// TODO: it would be cool to do some refactoring so this code always
// stayed in sync with OkapiFilterBridge, upon the behavior of
// which this depends. The current behavior of that class is to
// produce one WS text segment for each Okapi Segment object within
// the source TextContainer.
int injectTargetContent(WSNode node, WSAssetTranslation translation)
throws WSException {
List<ITextUnit> xliffTus = getEvents(node);
LocaleId okapiSrcLocale = FilterUtil.getOkapiLocaleId(node);
String encoding = node.getEncoding() != null ?
node.getEncoding() : XLIFFWSOkapiFilter.DEFAULT_XLIFF_ENCODING;
@SuppressWarnings("unchecked")
Iterator<WSTextSegmentTranslation> textSegs = (Iterator<WSTextSegmentTranslation>)translation.textSegmentIterator();
int count = 0;
for (ITextUnit xliffTu : xliffTus) {
TextContainer sourceTc = xliffTu.getSource();
TextContainer targetTc = findFirstTarget(xliffTu);
if (targetTc == null) {
for (@SuppressWarnings("unused")
Segment seg : sourceTc.getSegments()) {
skipSegment(textSegs);
}
continue;
}
for (Segment seg : targetTc.getSegments()) {
boolean injected = injectNextSegment(seg, textSegs);
if (injected) {
count++;
}
}
XLIFFTargetContentAligner aligner = new XLIFFTargetContentAligner(injectedTranslationType);
try {
int count = aligner.alignTargetContent(node.getInputStream(), encoding, okapiSrcLocale, textSegs);
LOG.info("Imported " + count + " segment translations");
return count;
}
LOG.info("Imported " + count + " segment translations");
return count;
}

TextContainer findFirstTarget(ITextUnit tu) {
Set<LocaleId> locales = tu.getTargetLocales();
LocaleId first = locales.iterator().next();
LOG.debug("First target locale is " + first);
return tu.getTarget(first);
}

void skipSegment(Iterator<WSTextSegmentTranslation> textSegs) {
checkForMoreWSSegments(textSegs);
WSTextSegmentTranslation textSeg = textSegs.next();
LOG.info("Skipping segment [" + textSeg.getSource() + "]");
}

/**
* Update the next text segment translation with content from the XLIFF. Do nothing
* if the translation was already the same as the XLIFF content.
* @return true if the translation was updated, false if the translation was already
* the same as the XLIFF content.
*/
boolean injectNextSegment(Segment xliffSeg, Iterator<WSTextSegmentTranslation> textSegs) {
checkForMoreWSSegments(textSegs);
WSTextSegmentTranslation textSeg = textSegs.next();

WSTextSegmentData wsMatch = WSTextSegmentData.fromOkapiSegment(xliffSeg);
String text = assignPlaceholderIds(wsMatch.getText());
if (textSeg.getTarget() == null || !textSeg.getTarget().equals(text)) {
LOG.info("Overwriting existing target=[" + textSeg.getTarget() + "] with new target=[" + text + "]");
textSeg.setTarget(text);
textSeg.setTranslationType(injectedTranslationType);
return true;
} else {
return false;
}
}

Pattern PH_PATTERN = Pattern.compile("\\{(\\d+)\\}");

private String assignPlaceholderIds(String text) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c == '{') {
String test = text.substring(i);
if (test.startsWith(WSFilter.PLACEHOLDER)) {
sb.append("{").append(Integer.toString(nextPlaceholderId++)).append("}");
i += WSFilter.PLACEHOLDER.length() - 1;
}
else {
// Escape "fake placeholders"
Matcher m = PH_PATTERN.matcher(test);
if (m.lookingAt()) {
sb.append("\\{").append(m.group(1)).append("\\}");
i += m.group().length() - 1;
}
else {
sb.append(c);
}
}
}
else {
sb.append(c);
}
}
return sb.toString();
}

private void checkForMoreWSSegments(Iterator<WSTextSegmentTranslation> textSegs) {
if (!textSegs.hasNext()) {
throw new IllegalStateException("Source XLIFF contains more segments than asset");
}
}

private List<ITextUnit> getEvents(WSNode node) throws WSException {
LocaleId okapiSrcLocale = FilterUtil.getOkapiLocaleId(node);
File tempFile = null;
try (XLIFFFilter filter = new XLIFFFilter()) {
// Filter may need multiple passes, so we need to buffer this to a
// temp file
tempFile = FilterUtil.convertAisContentIntoFile(node);
String encoding = node.getEncoding() != null ?
node.getEncoding() : XLIFFWSOkapiFilter.DEFAULT_XLIFF_ENCODING;
RawDocument rd = new RawDocument(tempFile.toURI(), encoding, okapiSrcLocale, okapiSrcLocale);
filter.open(rd, false);
List<ITextUnit> tus = new ArrayList<ITextUnit>();
while (filter.hasNext()) {
Event e = filter.next();
if (e.isTextUnit()) {
tus.add(e.getTextUnit());
}
}
return tus;
} catch (IOException e) {
throw new WSException(e);
} finally {
if (tempFile != null) {
tempFile.delete();
}
catch (IOException e) {
throw new WSRuntimeException(e);
}
}

@@ -0,0 +1,161 @@
package com.spartansoftwareinc.ws.autoactions.xliff;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.idiominc.wssdk.asset.WSTextSegmentTranslation;
import com.idiominc.wssdk.asset.WSTranslationType;
import com.idiominc.wssdk.component.filter.WSFilter;
import com.spartansoftwareinc.ws.okapi.filters.utils.FilterUtil;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.filters.xliff.XLIFFFilter;

class XLIFFTargetContentAligner {
private static final Logger LOG = Logger
.getLogger(XLIFFTargetContentAligner.class);

private WSTranslationType injectedTranslationType;
private int nextPlaceholderId = 1;

public XLIFFTargetContentAligner(WSTranslationType injectedTranslationType) {
this.injectedTranslationType = injectedTranslationType;
}

// TODO: it would be cool to do some refactoring so this code always
// stayed in sync with OkapiFilterBridge, upon the behavior of
// which this depends. The current behavior of that class is to
// produce one WS text segment for each Okapi Segment object within
// the source TextContainer.
public int alignTargetContent(InputStream xliffStream, String encoding, LocaleId srcLocale,
Iterator<WSTextSegmentTranslation> textSegs) throws IOException {
List<ITextUnit> xliffTus = getEvents(xliffStream, encoding, srcLocale);
int count = 0;
for (ITextUnit xliffTu : xliffTus) {
TextContainer sourceTc = xliffTu.getSource();
TextContainer targetTc = findFirstTarget(xliffTu);
if (targetTc == null) {
for (@SuppressWarnings("unused")
Segment seg : sourceTc.getSegments()) {
skipSegment(textSegs);
}
continue;
}
for (Segment seg : targetTc.getSegments()) {
boolean injected = injectNextSegment(seg, textSegs);
if (injected) {
count++;
}
}
}
return count;
}

private List<ITextUnit> getEvents(InputStream is, String encoding, LocaleId srcLocale) throws IOException {
File tempFile = null;
try (XLIFFFilter filter = new XLIFFFilter()) {
// Filter may need multiple passes, so we need to buffer this to a
// temp file
tempFile = FilterUtil.convertContentIntoFile(is, ".xlf");
RawDocument rd = new RawDocument(tempFile.toURI(), encoding, srcLocale, srcLocale);
filter.open(rd, false);
List<ITextUnit> tus = new ArrayList<ITextUnit>();
while (filter.hasNext()) {
Event e = filter.next();
if (e.isTextUnit()) {
tus.add(e.getTextUnit());
}
}
return tus;
} finally {
if (tempFile != null) {
tempFile.delete();
}
}
}

TextContainer findFirstTarget(ITextUnit tu) {
Set<LocaleId> locales = tu.getTargetLocales();
LocaleId first = locales.iterator().next();
LOG.debug("First target locale is " + first);
return tu.getTarget(first);
}

void skipSegment(Iterator<WSTextSegmentTranslation> textSegs) {
checkForMoreWSSegments(textSegs);
WSTextSegmentTranslation textSeg = textSegs.next();
LOG.info("Skipping segment [" + textSeg.getSource() + "]");
}

/**
* Update the next text segment translation with content from the XLIFF. Do nothing
* if the translation was already the same as the XLIFF content.
* @return true if the translation was updated, false if the translation was already
* the same as the XLIFF content.
*/
boolean injectNextSegment(Segment xliffSeg, Iterator<WSTextSegmentTranslation> textSegs) {
checkForMoreWSSegments(textSegs);
WSTextSegmentTranslation textSeg = textSegs.next();

WSTextSegmentData wsMatch = WSTextSegmentData.fromOkapiSegment(xliffSeg);
String text = assignPlaceholderIds(wsMatch.getText());
if (textSeg.getTarget() == null || !textSeg.getTarget().equals(text)) {
LOG.info("Overwriting existing target=[" + textSeg.getTarget() + "] with new target=[" + text + "]");
textSeg.setTarget(text);
textSeg.setTranslationType(injectedTranslationType);
return true;
} else {
return false;
}
}

Pattern PH_PATTERN = Pattern.compile("\\{(\\d+)\\}");

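/**
 * Replaces each WSFilter.PLACEHOLDER token in the segment text with a
 * sequentially numbered placeholder ("{1}", "{2}", ...); numbering continues
 * across segments. Literal "{n}" sequences already present in the content are
 * escaped as "\{n\}" so they are not mistaken for real placeholders.
 */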
private String assignPlaceholderIds(String text) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c == '{') {
String test = text.substring(i);
if (test.startsWith(WSFilter.PLACEHOLDER)) {
sb.append("{").append(Integer.toString(nextPlaceholderId++)).append("}");
i += WSFilter.PLACEHOLDER.length() - 1;
}
else {
// Escape "fake placeholders"
Matcher m = PH_PATTERN.matcher(test);
if (m.lookingAt()) {
sb.append("\\{").append(m.group(1)).append("\\}");
i += m.group().length() - 1;
}
else {
sb.append(c);
}
}
}
else {
sb.append(c);
}
}
return sb.toString();
}

private void checkForMoreWSSegments(Iterator<WSTextSegmentTranslation> textSegs) {
if (!textSegs.hasNext()) {
throw new IllegalStateException("Source XLIFF contains more segments than asset");
}
}
}
@@ -0,0 +1,26 @@
package com.spartansoftwareinc.ws.autoactions.xliff;

import org.junit.Test;
import static org.junit.Assert.*;

import com.idiominc.wssdk.component.filter.WSFilter;

import net.sf.okapi.common.resource.Code;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextFragment.TagType;

public class WSTextSegmentDataTest {

@Test
public void fromOkapiSegment() {
TextFragment tf = new TextFragment("Hello ");
tf.append(new Code(TagType.OPENING, "bold", "<b>"));
tf.append("world");
tf.append(new Code(TagType.CLOSING, "bold", "</b>"));
Segment segment = new Segment("seg1", tf);
WSTextSegmentData data = WSTextSegmentData.fromOkapiSegment(segment);
assertEquals("Hello " + WSFilter.PLACEHOLDER + "world" + WSFilter.PLACEHOLDER, data.getText());
assertArrayEquals(new String[] { "<b>", "</b>" }, data.getPlaceholders());
}
}