From c13dc1d1817ad3870ae66022f995056820fae6f1 Mon Sep 17 00:00:00 2001 From: Joe Gilvary Date: Fri, 17 May 2024 18:45:14 -0400 Subject: [PATCH] NUTCH-3057 - Fix for index-arbitrary plugin improper retention and use of calculated value for arbitrary field after an exception --- .../arbitrary/ArbitraryIndexingFilter.java | 3 + .../TestArbitraryIndexingFilter.java | 58 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java index 7677ef7f8..21a4537bf 100644 --- a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java +++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java @@ -170,6 +170,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, int cfgCounter = 0; while (cfgCounter < arbitraryAddsCount) { + result = null; setIndexedConf(conf,cfgCounter); cfgCounter++; try { @@ -184,6 +185,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, LOG.error("Exception preparing reflection tasks. 
className was {}", String.valueOf(className)); e.printStackTrace(); + continue; } try { constrArgs = new String[userConstrArgs.length + 1]; @@ -207,6 +209,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, LOG.error("methodArgs[0] was {}", String.valueOf(methodArgs[0])); } e.printStackTrace(); + continue; } LOG.debug("{}.{}() returned {} for field {}.", className, diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java index 17f31b183..adaecf55c 100644 --- a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java +++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java @@ -219,4 +219,62 @@ public void testOverwritingExistingField() throws Exception { Assert.assertTrue("field philosopher does not have new value 'Popeye'", doc.getField("philosopher") .getValues().contains("Popeye")); } + + /** + * Verifies that a field whose configured method does not exist (field 1 names "noExistingMethod" on java.lang.String) is left unset, and that the field configured after it still receives its own value rather than one left over from an earlier field. + * + * @throws Exception on an unexpected failure outside the guarded call to the filter + */ + @Test + public void testProcessingFieldAfterException() throws Exception { + conf = NutchConfiguration.create(); + conf.set("index.arbitrary.function.count","3"); + conf.set("index.arbitrary.fieldName.0","foo"); + conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.0","first added value"); + conf.set("index.arbitrary.methodName.0","getText"); + + conf.set("index.arbitrary.fieldName.1","mangled"); + conf.set("index.arbitrary.className.1","java.lang.String"); + conf.set("index.arbitrary.constructorArgs.1","bar"); + conf.set("index.arbitrary.methodName.1","noExistingMethod"); + conf.set("index.arbitrary.methodArgs.1","100"); + conf.set("index.arbitrary.overwrite.1","true"); + 
conf.set("index.arbitrary.fieldName.2","philosopher"); + conf.set("index.arbitrary.className.2","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.2","last added value"); + conf.set("index.arbitrary.methodName.2","getText"); + conf.set("index.arbitrary.overwrite.2","true"); + + filter = new ArbitraryIndexingFilter(); + Assert.assertNotNull("No filter exists for testProcessingFieldAfterException",filter); + + filter.setConf(conf); + Assert.assertNotNull("conf does not exist",conf); + + doc = new NutchDocument(); + + Assert.assertNotNull("doc does not exist",doc); + + try { + filter.filter(doc, parse, url, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(System.out); + Assert.fail(e.getMessage()); + } + + Assert.assertNotNull(doc); + + Assert.assertTrue("field foo does not have 'first added value'", doc.getField("foo") + .getValues().contains("first added value")); + + Assert.assertNull("field mangled has a value", doc.getField("mangled")); + + Assert.assertFalse("Value 'first added value' has leaked into field philosopher", doc.getField("philosopher") + .getValues().contains("first added value")); + + Assert.assertTrue("field philosopher does not have new value 'last added value'", doc.getField("philosopher") + .getValues().contains("last added value")); + } }