diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysisConstants.java src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java --- src.orig/java/org/apache/nutch/analysis/NutchAnalysisConstants.java 2009-03-10 11:34:01.000000000 -0700 +++ src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java 2009-03-10 14:11:55.000000000 -0700 @@ -4,30 +4,34 @@ public interface NutchAnalysisConstants { int EOF = 0; - int WORD = 1; - int ACRONYM = 2; - int SIGRAM = 3; - int IRREGULAR_WORD = 4; - int C_PLUS_PLUS = 5; - int C_SHARP = 6; - int PLUS = 7; - int MINUS = 8; - int QUOTE = 9; - int COLON = 10; - int SLASH = 11; - int DOT = 12; - int ATSIGN = 13; - int APOSTROPHE = 14; - int WHITE = 15; - int WORD_PUNCT = 16; - int LETTER = 17; - int CJK = 18; - int DIGIT = 19; + int QOR = 1; + int WORD = 2; + int ACRONYM = 3; + int SIGRAM = 4; + int IRREGULAR_WORD = 5; + int C_PLUS_PLUS = 6; + int C_SHARP = 7; + int PLUS = 8; + int MINUS = 9; + int QUOTE = 10; + int COLON = 11; + int SLASH = 12; + int LPAREN = 13; + int RPAREN = 14; + int DOT = 15; + int ATSIGN = 16; + int APOSTROPHE = 17; + int WHITE = 18; + int WORD_PUNCT = 19; + int LETTER = 20; + int CJK = 21; + int DIGIT = 22; int DEFAULT = 0; String[] tokenImage = { "<EOF>", + "\"OR\"", "<WORD>", "<ACRONYM>", "<SIGRAM>", @@ -39,6 +43,8 @@ "\"\\\"\"", "\":\"", "\"/\"", + "\"(\"", + "\")\"", "\".\"", "\"@\"", "\"\\\'\"", diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysis.java src/java/org/apache/nutch/analysis/NutchAnalysis.java --- src.orig/java/org/apache/nutch/analysis/NutchAnalysis.java 2009-03-10 11:34:01.000000000 -0700 +++ src/java/org/apache/nutch/analysis/NutchAnalysis.java 2009-03-10 14:11:55.000000000 -0700 @@ -56,7 +56,11 @@ queryString, (analyzer != null) ? 
analyzer : new NutchDocumentAnalyzer(conf)); parser.queryString = queryString; parser.queryFilters = new QueryFilters(conf); - return parser.parse(conf); + try { + return parser.parse(conf); + } catch (ParseException pe) { + throw (IOException) new IOException("Parse exception: " + pe).initCause(pe); + } } /** For debugging. */ @@ -77,44 +81,60 @@ String field; boolean stop; boolean prohibited; + boolean required; nonOpOrTerm(); label_1: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case QOR: case WORD: case ACRONYM: case SIGRAM: case PLUS: case MINUS: case QUOTE: + case WHITE: ; break; default: jj_la1[0] = jj_gen; break label_1; } - stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case PLUS: - case MINUS: + stop=true; prohibited=false; required=true; field = Clause.DEFAULT_FIELD; + label_2: + while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case QOR: + case PLUS: + case MINUS: + case WHITE: + ; + break; + default: + jj_la1[1] = jj_gen; + break label_2; + } + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case QOR: + disjunction(); + query.setLastTermNotRequired(); required=false; + break; case PLUS: jj_consume_token(PLUS); - stop=false; + stop=false;required=true; break; case MINUS: jj_consume_token(MINUS); - stop=false;prohibited=true; + stop=false;prohibited=true; + break; + case WHITE: + jj_consume_token(WHITE); break; default: - jj_la1[1] = jj_gen; + jj_la1[2] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - break; - default: - jj_la1[2] = jj_gen; - ; } if (jj_2_1(2147483647)) { token = jj_consume_token(WORD); @@ -150,15 +170,18 @@ } else { if (prohibited) query.addProhibitedPhrase(array, field); - else + else if (required) query.addRequiredPhrase(array, field); + else { + query.addNormalPhrase(array, field); + } } } {if (true) return query;} throw new Error("Missing return statement in function"); } -/** Parse an explcitly quoted phrase query. 
Note that this may return a single +/** Parse an explicitly quoted phrase query. Note that this may return a single * term, a trivial phrase.*/ final public ArrayList phrase(String field) throws ParseException { int start; @@ -167,7 +190,7 @@ String term; jj_consume_token(QUOTE); start = token.endColumn; - label_2: + label_3: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: @@ -182,11 +205,11 @@ break; default: jj_la1[4] = jj_gen; - break label_2; + break label_3; } nonTerm(); } - label_3: + label_4: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case WORD: @@ -196,11 +219,11 @@ break; default: jj_la1[5] = jj_gen; - break label_3; + break label_4; } term = term(); result.add(term); - label_4: + label_5: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: @@ -215,7 +238,7 @@ break; default: jj_la1[6] = jj_gen; - break label_4; + break label_5; } nonTerm(); } @@ -243,7 +266,7 @@ /** Parse a compound term that is interpreted as an implicit phrase query. * Compounds are a sequence of terms separated by infix characters. Note that - * htis may return a single term, a trivial compound. */ + * this may return a single term, a trivial compound. 
*/ final public ArrayList compound(String field) throws ParseException { int start; ArrayList result = new ArrayList(); @@ -254,14 +277,14 @@ terms.append(term).append(" "); //result.add(term); - label_5: + label_6: while (true) { if (jj_2_2(2147483647)) { ; } else { - break label_5; + break label_6; } - label_6: + label_7: while (true) { infix(); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { @@ -276,7 +299,7 @@ break; default: jj_la1[8] = jj_gen; - break label_6; + break label_7; } } term = term(); @@ -356,6 +379,10 @@ } } + final public void disjunction() throws ParseException { + jj_consume_token(QOR); + } + final public void nonTermOrEOF() throws ParseException { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: @@ -378,14 +405,14 @@ } } -/** Parse anything but a term or an operator (plur or minus or quote). */ +/** Parse anything but a term or an operator (plus or minus or quote). */ final public void nonOpOrTerm() throws ParseException { - label_7: + label_8: while (true) { if (jj_2_3(2)) { ; } else { - break label_7; + break label_8; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case WHITE: @@ -491,213 +518,213 @@ finally { jj_save(2, xla); } } - final private boolean jj_3_1() { - if (jj_scan_token(WORD)) return true; - if (jj_scan_token(COLON)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_8()) { - jj_scanpos = xsp; - if (jj_3R_9()) return true; - } + final private boolean jj_3R_24() { + if (jj_3R_25()) return true; return false; } - final private boolean jj_3R_16() { + final private boolean jj_3R_19() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(7)) { + if (jj_3R_24()) { jj_scanpos = xsp; - if (jj_scan_token(8)) { - jj_scanpos = xsp; - if (jj_3R_22()) return true; - } + if (jj_scan_token(0)) return true; } return false; } - final private boolean jj_3_3() { + final private boolean jj_3R_14() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(15)) { - jj_scanpos = xsp; - if (jj_3R_12()) { + if (jj_scan_token(8)) { jj_scanpos = xsp; - if (jj_3R_13()) 
return true; - } + if (jj_scan_token(9)) return true; } + if (jj_3R_19()) return true; return false; } - final private boolean jj_3R_25() { - if (jj_3R_24()) return true; + final private boolean jj_3R_26() { + if (jj_3R_25()) return true; return false; } - final private boolean jj_3R_27() { - if (jj_3R_16()) return true; + final private boolean jj_3R_28() { + if (jj_3R_17()) return true; return false; } - final private boolean jj_3R_20() { - if (jj_3R_11()) return true; + final private boolean jj_3R_21() { + if (jj_3R_12()) return true; Token xsp; while (true) { xsp = jj_scanpos; - if (jj_3R_25()) { jj_scanpos = xsp; break; } + if (jj_3R_26()) { jj_scanpos = xsp; break; } } return false; } - final private boolean jj_3R_10() { - if (jj_3R_16()) return true; - return false; - } - - final private boolean jj_3R_19() { - if (jj_3R_24()) return true; + final private boolean jj_3R_11() { + if (jj_3R_17()) return true; return false; } final private boolean jj_3_2() { Token xsp; - if (jj_3R_10()) return true; + if (jj_3R_11()) return true; while (true) { xsp = jj_scanpos; - if (jj_3R_10()) { jj_scanpos = xsp; break; } + if (jj_3R_11()) { jj_scanpos = xsp; break; } } - if (jj_3R_11()) return true; + if (jj_3R_12()) return true; return false; } - final private boolean jj_3R_23() { - if (jj_3R_24()) return true; + final private boolean jj_3R_20() { + if (jj_3R_25()) return true; return false; } - final private boolean jj_3R_18() { + final private boolean jj_3R_10() { + if (jj_3R_16()) return true; + return false; + } + + final private boolean jj_3R_15() { + if (jj_scan_token(QUOTE)) return true; Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_20()) { jj_scanpos = xsp; break; } + } + while (true) { + xsp = jj_scanpos; + if (jj_3R_21()) { jj_scanpos = xsp; break; } + } xsp = jj_scanpos; - if (jj_3R_23()) { + if (jj_scan_token(10)) { jj_scanpos = xsp; if (jj_scan_token(0)) return true; } return false; } - final private boolean jj_3R_13() { + final private boolean 
jj_3R_25() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(7)) { + if (jj_scan_token(18)) { jj_scanpos = xsp; - if (jj_scan_token(8)) return true; + if (jj_3R_28()) return true; } + return false; + } + + final private boolean jj_3R_27() { + if (jj_3R_17()) return true; + return false; + } + + final private boolean jj_3R_23() { if (jj_3R_18()) return true; return false; } - final private boolean jj_3R_9() { - if (jj_3R_15()) return true; + final private boolean jj_3R_13() { + if (jj_3R_18()) return true; return false; } - final private boolean jj_3R_14() { - if (jj_scan_token(QUOTE)) return true; + final private boolean jj_3R_22() { Token xsp; + if (jj_3R_27()) return true; while (true) { xsp = jj_scanpos; - if (jj_3R_19()) { jj_scanpos = xsp; break; } - } - while (true) { - xsp = jj_scanpos; - if (jj_3R_20()) { jj_scanpos = xsp; break; } - } - xsp = jj_scanpos; - if (jj_scan_token(9)) { - jj_scanpos = xsp; - if (jj_scan_token(0)) return true; + if (jj_3R_27()) { jj_scanpos = xsp; break; } } + if (jj_3R_12()) return true; return false; } - final private boolean jj_3R_24() { + final private boolean jj_3R_12() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(15)) { + if (jj_scan_token(2)) { jj_scanpos = xsp; - if (jj_3R_27()) return true; + if (jj_scan_token(3)) { + jj_scanpos = xsp; + if (jj_scan_token(4)) return true; + } } return false; } - final private boolean jj_3R_26() { - if (jj_3R_16()) return true; + final private boolean jj_3R_9() { + if (jj_3R_15()) return true; return false; } - final private boolean jj_3R_21() { + final private boolean jj_3R_16() { + if (jj_3R_12()) return true; Token xsp; - if (jj_3R_26()) return true; while (true) { xsp = jj_scanpos; - if (jj_3R_26()) { jj_scanpos = xsp; break; } + if (jj_3R_22()) { jj_scanpos = xsp; break; } } - if (jj_3R_11()) return true; return false; } - final private boolean jj_3R_22() { - if (jj_3R_17()) return true; - return false; - } - - final private boolean jj_3R_8() { - if (jj_3R_14()) return true; - 
return false; - } - - final private boolean jj_3R_12() { - if (jj_3R_17()) return true; + final private boolean jj_3R_18() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(11)) { + jj_scanpos = xsp; + if (jj_scan_token(12)) { + jj_scanpos = xsp; + if (jj_scan_token(15)) { + jj_scanpos = xsp; + if (jj_scan_token(16)) { + jj_scanpos = xsp; + if (jj_scan_token(17)) return true; + } + } + } + } return false; } - final private boolean jj_3R_11() { + final private boolean jj_3R_17() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(1)) { + if (jj_scan_token(8)) { jj_scanpos = xsp; - if (jj_scan_token(2)) { + if (jj_scan_token(9)) { jj_scanpos = xsp; - if (jj_scan_token(3)) return true; + if (jj_3R_23()) return true; } } return false; } - final private boolean jj_3R_15() { - if (jj_3R_11()) return true; + final private boolean jj_3_1() { + if (jj_scan_token(WORD)) return true; + if (jj_scan_token(COLON)) return true; Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_21()) { jj_scanpos = xsp; break; } + xsp = jj_scanpos; + if (jj_3R_9()) { + jj_scanpos = xsp; + if (jj_3R_10()) return true; } return false; } - final private boolean jj_3R_17() { + final private boolean jj_3_3() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(10)) { + if (jj_scan_token(18)) { jj_scanpos = xsp; - if (jj_scan_token(11)) { + if (jj_3R_13()) { jj_scanpos = xsp; - if (jj_scan_token(12)) { - jj_scanpos = xsp; - if (jj_scan_token(13)) { - jj_scanpos = xsp; - if (jj_scan_token(14)) return true; - } - } + if (jj_3R_14()) return true; } } return false; @@ -717,7 +744,7 @@ jj_la1_0(); } private static void jj_la1_0() { - jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,}; + jj_la1_0 = new int[] {0x4071e,0x40302,0x40302,0x41c,0x79b00,0x1c,0x79b00,0x401,0x39b00,0x1c,0x79b00,0x79b01,0x300,0x79b00,0x39b00,0x39800,}; } final private JJCalls[] jj_2_rtns = new JJCalls[3]; private boolean jj_rescan = false; @@ -866,8 
+893,8 @@ public ParseException generateParseException() { jj_expentries.removeAllElements(); - boolean[] la1tokens = new boolean[20]; - for (int i = 0; i < 20; i++) { + boolean[] la1tokens = new boolean[23]; + for (int i = 0; i < 23; i++) { la1tokens[i] = false; } if (jj_kind >= 0) { @@ -883,7 +910,7 @@ } } } - for (int i = 0; i < 20; i++) { + for (int i = 0; i < 23; i++) { if (la1tokens[i]) { jj_expentry = new int[1]; jj_expentry[0] = i; diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysis.java.old src/java/org/apache/nutch/analysis/NutchAnalysis.java.old --- src.orig/java/org/apache/nutch/analysis/NutchAnalysis.java.old 1969-12-31 16:00:00.000000000 -0800 +++ src/java/org/apache/nutch/analysis/NutchAnalysis.java.old 2009-01-29 09:28:01.000000000 -0800 @@ -0,0 +1,946 @@ +/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */ +package org.apache.nutch.analysis; + +import java.io.StringReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.QueryFilters; +import org.apache.nutch.searcher.Query.Clause; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.*; +import java.util.*; + +/** The JavaCC-generated Nutch lexical analyzer and query parser. */ +public class NutchAnalysis implements NutchAnalysisConstants { + + private static final String[] STOP_WORDS = { + "a", "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "s", "such", + "t", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" + }; + + private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS); + + private Analyzer analyzer = null; + private String queryString; + private QueryFilters queryFilters; + + + /** Constructs a nutch analysis. 
*/ + public NutchAnalysis(String query, Analyzer analyzer) { + this(new FastCharStream(new StringReader(query))); + this.analyzer = analyzer; + } + + /** True iff word is a stop word. Stop words are only removed from queries. + * Every word is indexed. */ + public static boolean isStopWord(String word) { + return STOP_SET.contains(word); + } + + /** Construct a query parser for the text in a reader. */ + public static Query parseQuery(String queryString, Configuration conf) throws IOException { + return parseQuery(queryString, null, conf); + } + + /** Construct a query parser for the text in a reader. */ + public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf) + throws IOException { + NutchAnalysis parser = new NutchAnalysis( + queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf)); + parser.queryString = queryString; + parser.queryFilters = new QueryFilters(conf); + return parser.parse(conf); + } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while (true) { + System.out.print("Query: "); + String line = in.readLine(); + System.out.println(parseQuery(line, NutchConfiguration.create())); + } + } + +/** Parse a query. 
*/ + final public Query parse(Configuration conf) throws ParseException { + Query query = new Query(conf); + ArrayList terms; + Token token; + String field; + boolean stop; + boolean prohibited; + nonOpOrTerm(); + label_1: + while (true) { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case WORD: + case ACRONYM: + case SIGRAM: + case PLUS: + case MINUS: + case QUOTE: + ; + break; + default: + jj_la1[0] = jj_gen; + break label_1; + } + stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + case MINUS: + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + jj_consume_token(PLUS); + stop=false; + break; + case MINUS: + jj_consume_token(MINUS); + stop=false;prohibited=true; + break; + default: + jj_la1[1] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + break; + default: + jj_la1[2] = jj_gen; + ; + } + if (jj_2_1(2147483647)) { + token = jj_consume_token(WORD); + jj_consume_token(COLON); + field = token.image; + } else { + ; + } + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case QUOTE: + terms = phrase(field); + stop=false; + break; + case WORD: + case ACRONYM: + case SIGRAM: + // quoted terms or + terms = compound(field); + break; + default: + jj_la1[3] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + nonOpOrTerm(); + String[] array = (String[])terms.toArray(new String[terms.size()]); + + if (stop + && field == Clause.DEFAULT_FIELD + && terms.size()==1 + && isStopWord(array[0])) { + // ignore stop words only when single, unadorned terms in default field + } else { + if (prohibited) + query.addProhibitedPhrase(array, field); + else + query.addRequiredPhrase(array, field); + } + } + {if (true) return query;} + throw new Error("Missing return statement in function"); + } + +/** Parse an explcitly quoted phrase query. 
Note that this may return a single + * term, a trivial phrase.*/ + final public ArrayList phrase(String field) throws ParseException { + int start; + int end; + ArrayList result = new ArrayList(); + String term; + jj_consume_token(QUOTE); + start = token.endColumn; + label_2: + while (true) { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + case MINUS: + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + case WHITE: + ; + break; + default: + jj_la1[4] = jj_gen; + break label_2; + } + nonTerm(); + } + label_3: + while (true) { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case WORD: + case ACRONYM: + case SIGRAM: + ; + break; + default: + jj_la1[5] = jj_gen; + break label_3; + } + term = term(); + result.add(term); + label_4: + while (true) { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + case MINUS: + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + case WHITE: + ; + break; + default: + jj_la1[6] = jj_gen; + break label_4; + } + nonTerm(); + } + } + end = token.endColumn; + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case QUOTE: + jj_consume_token(QUOTE); + break; + case 0: + jj_consume_token(0); + break; + default: + jj_la1[7] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + if (this.queryFilters.isRawField(field)) { + result.clear(); + result.add(queryString.substring(start, end)); + } + {if (true) return result;} + throw new Error("Missing return statement in function"); + } + +/** Parse a compound term that is interpreted as an implicit phrase query. + * Compounds are a sequence of terms separated by infix characters. Note that + * htis may return a single term, a trivial compound. 
*/ + final public ArrayList compound(String field) throws ParseException { + int start; + ArrayList result = new ArrayList(); + String term; + StringBuffer terms = new StringBuffer(); + start = token.endColumn; + term = term(); + terms.append(term).append(" "); + //result.add(term); + + label_5: + while (true) { + if (jj_2_2(2147483647)) { + ; + } else { + break label_5; + } + label_6: + while (true) { + infix(); + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + case MINUS: + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + ; + break; + default: + jj_la1[8] = jj_gen; + break label_6; + } + } + term = term(); + terms.append(term).append(" "); + //result.add(term); + + } + if (this.queryFilters.isRawField(field)) { +// result.clear(); + result.add(queryString.substring(start, token.endColumn)); + + } else { + org.apache.lucene.analysis.Token token; + TokenStream tokens = analyzer.tokenStream( + field, new StringReader(terms.toString())); + + while (true) { + try { + token = tokens.next(); + } catch (IOException e) { + token = null; + } + if (token == null) { break; } + result.add(token.termText()); + } + try { + tokens.close(); + } catch (IOException e) { + // ignore + } + } + {if (true) return result;} + throw new Error("Missing return statement in function"); + } + +/** Parse a single term. */ + final public String term() throws ParseException { + Token token; + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case WORD: + token = jj_consume_token(WORD); + break; + case ACRONYM: + token = jj_consume_token(ACRONYM); + break; + case SIGRAM: + token = jj_consume_token(SIGRAM); + break; + default: + jj_la1[9] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + {if (true) return token.image;} + throw new Error("Missing return statement in function"); + } + +/** Parse anything but a term or a quote. 
*/ + final public void nonTerm() throws ParseException { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case WHITE: + jj_consume_token(WHITE); + break; + case PLUS: + case MINUS: + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + infix(); + break; + default: + jj_la1[10] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + + final public void nonTermOrEOF() throws ParseException { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + case MINUS: + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + case WHITE: + nonTerm(); + break; + case 0: + jj_consume_token(0); + break; + default: + jj_la1[11] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + +/** Parse anything but a term or an operator (plur or minus or quote). */ + final public void nonOpOrTerm() throws ParseException { + label_7: + while (true) { + if (jj_2_3(2)) { + ; + } else { + break label_7; + } + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case WHITE: + jj_consume_token(WHITE); + break; + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + nonOpInfix(); + break; + case PLUS: + case MINUS: + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + jj_consume_token(PLUS); + break; + case MINUS: + jj_consume_token(MINUS); + break; + default: + jj_la1[12] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + nonTermOrEOF(); + break; + default: + jj_la1[13] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + } + +/** Characters which can be used to form compound terms. 
*/ + final public void infix() throws ParseException { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case PLUS: + jj_consume_token(PLUS); + break; + case MINUS: + jj_consume_token(MINUS); + break; + case COLON: + case SLASH: + case DOT: + case ATSIGN: + case APOSTROPHE: + nonOpInfix(); + break; + default: + jj_la1[14] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + +/** Parse infix characters except plus and minus. */ + final public void nonOpInfix() throws ParseException { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case COLON: + jj_consume_token(COLON); + break; + case SLASH: + jj_consume_token(SLASH); + break; + case DOT: + jj_consume_token(DOT); + break; + case ATSIGN: + jj_consume_token(ATSIGN); + break; + case APOSTROPHE: + jj_consume_token(APOSTROPHE); + break; + default: + jj_la1[15] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } + + final private boolean jj_2_1(int xla) { + jj_la = xla; jj_lastpos = jj_scanpos = token; + try { return !jj_3_1(); } + catch(LookaheadSuccess ls) { return true; } + finally { jj_save(0, xla); } + } + + final private boolean jj_2_2(int xla) { + jj_la = xla; jj_lastpos = jj_scanpos = token; + try { return !jj_3_2(); } + catch(LookaheadSuccess ls) { return true; } + finally { jj_save(1, xla); } + } + + final private boolean jj_2_3(int xla) { + jj_la = xla; jj_lastpos = jj_scanpos = token; + try { return !jj_3_3(); } + catch(LookaheadSuccess ls) { return true; } + finally { jj_save(2, xla); } + } + + final private boolean jj_3_1() { + if (jj_scan_token(WORD)) return true; + if (jj_scan_token(COLON)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_3R_8()) { + jj_scanpos = xsp; + if (jj_3R_9()) return true; + } + return false; + } + + final private boolean jj_3R_16() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(7)) { + jj_scanpos = xsp; + if (jj_scan_token(8)) { + jj_scanpos = xsp; + if (jj_3R_22()) return true; + } + } + return false; + } + + final private boolean 
jj_3_3() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(15)) { + jj_scanpos = xsp; + if (jj_3R_12()) { + jj_scanpos = xsp; + if (jj_3R_13()) return true; + } + } + return false; + } + + final private boolean jj_3R_25() { + if (jj_3R_24()) return true; + return false; + } + + final private boolean jj_3R_27() { + if (jj_3R_16()) return true; + return false; + } + + final private boolean jj_3R_20() { + if (jj_3R_11()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_25()) { jj_scanpos = xsp; break; } + } + return false; + } + + final private boolean jj_3R_10() { + if (jj_3R_16()) return true; + return false; + } + + final private boolean jj_3R_19() { + if (jj_3R_24()) return true; + return false; + } + + final private boolean jj_3_2() { + Token xsp; + if (jj_3R_10()) return true; + while (true) { + xsp = jj_scanpos; + if (jj_3R_10()) { jj_scanpos = xsp; break; } + } + if (jj_3R_11()) return true; + return false; + } + + final private boolean jj_3R_23() { + if (jj_3R_24()) return true; + return false; + } + + final private boolean jj_3R_18() { + Token xsp; + xsp = jj_scanpos; + if (jj_3R_23()) { + jj_scanpos = xsp; + if (jj_scan_token(0)) return true; + } + return false; + } + + final private boolean jj_3R_13() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(7)) { + jj_scanpos = xsp; + if (jj_scan_token(8)) return true; + } + if (jj_3R_18()) return true; + return false; + } + + final private boolean jj_3R_9() { + if (jj_3R_15()) return true; + return false; + } + + final private boolean jj_3R_14() { + if (jj_scan_token(QUOTE)) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_19()) { jj_scanpos = xsp; break; } + } + while (true) { + xsp = jj_scanpos; + if (jj_3R_20()) { jj_scanpos = xsp; break; } + } + xsp = jj_scanpos; + if (jj_scan_token(9)) { + jj_scanpos = xsp; + if (jj_scan_token(0)) return true; + } + return false; + } + + final private boolean jj_3R_24() { + Token xsp; + xsp = jj_scanpos; + if 
(jj_scan_token(15)) { + jj_scanpos = xsp; + if (jj_3R_27()) return true; + } + return false; + } + + final private boolean jj_3R_26() { + if (jj_3R_16()) return true; + return false; + } + + final private boolean jj_3R_21() { + Token xsp; + if (jj_3R_26()) return true; + while (true) { + xsp = jj_scanpos; + if (jj_3R_26()) { jj_scanpos = xsp; break; } + } + if (jj_3R_11()) return true; + return false; + } + + final private boolean jj_3R_22() { + if (jj_3R_17()) return true; + return false; + } + + final private boolean jj_3R_8() { + if (jj_3R_14()) return true; + return false; + } + + final private boolean jj_3R_12() { + if (jj_3R_17()) return true; + return false; + } + + final private boolean jj_3R_11() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(1)) { + jj_scanpos = xsp; + if (jj_scan_token(2)) { + jj_scanpos = xsp; + if (jj_scan_token(3)) return true; + } + } + return false; + } + + final private boolean jj_3R_15() { + if (jj_3R_11()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_21()) { jj_scanpos = xsp; break; } + } + return false; + } + + final private boolean jj_3R_17() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(10)) { + jj_scanpos = xsp; + if (jj_scan_token(11)) { + jj_scanpos = xsp; + if (jj_scan_token(12)) { + jj_scanpos = xsp; + if (jj_scan_token(13)) { + jj_scanpos = xsp; + if (jj_scan_token(14)) return true; + } + } + } + } + return false; + } + + public NutchAnalysisTokenManager token_source; + public Token token, jj_nt; + private int jj_ntk; + private Token jj_scanpos, jj_lastpos; + private int jj_la; + public boolean lookingAhead = false; + private boolean jj_semLA; + private int jj_gen; + final private int[] jj_la1 = new int[16]; + static private int[] jj_la1_0; + static { + jj_la1_0(); + } + private static void jj_la1_0() { + jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,}; + } + final private JJCalls[] jj_2_rtns = new 
JJCalls[3]; + private boolean jj_rescan = false; + private int jj_gc = 0; + + public NutchAnalysis(CharStream stream) { + token_source = new NutchAnalysisTokenManager(stream); + token = new Token(); + jj_ntk = -1; + jj_gen = 0; + for (int i = 0; i < 16; i++) jj_la1[i] = -1; + for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); + } + + public void ReInit(CharStream stream) { + token_source.ReInit(stream); + token = new Token(); + jj_ntk = -1; + jj_gen = 0; + for (int i = 0; i < 16; i++) jj_la1[i] = -1; + for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); + } + + public NutchAnalysis(NutchAnalysisTokenManager tm) { + token_source = tm; + token = new Token(); + jj_ntk = -1; + jj_gen = 0; + for (int i = 0; i < 16; i++) jj_la1[i] = -1; + for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); + } + + public void ReInit(NutchAnalysisTokenManager tm) { + token_source = tm; + token = new Token(); + jj_ntk = -1; + jj_gen = 0; + for (int i = 0; i < 16; i++) jj_la1[i] = -1; + for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); + } + + final private Token jj_consume_token(int kind) throws ParseException { + Token oldToken; + if ((oldToken = token).next != null) token = token.next; + else token = token.next = token_source.getNextToken(); + jj_ntk = -1; + if (token.kind == kind) { + jj_gen++; + if (++jj_gc > 100) { + jj_gc = 0; + for (int i = 0; i < jj_2_rtns.length; i++) { + JJCalls c = jj_2_rtns[i]; + while (c != null) { + if (c.gen < jj_gen) c.first = null; + c = c.next; + } + } + } + return token; + } + token = oldToken; + jj_kind = kind; + throw generateParseException(); + } + + static private final class LookaheadSuccess extends java.lang.Error { } + final private LookaheadSuccess jj_ls = new LookaheadSuccess(); + final private boolean jj_scan_token(int kind) { + if (jj_scanpos == jj_lastpos) { + jj_la--; + if (jj_scanpos.next == null) { + jj_lastpos = jj_scanpos = jj_scanpos.next = 
token_source.getNextToken(); + } else { + jj_lastpos = jj_scanpos = jj_scanpos.next; + } + } else { + jj_scanpos = jj_scanpos.next; + } + if (jj_rescan) { + int i = 0; Token tok = token; + while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; } + if (tok != null) jj_add_error_token(kind, i); + } + if (jj_scanpos.kind != kind) return true; + if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls; + return false; + } + + final public Token getNextToken() { + if (token.next != null) token = token.next; + else token = token.next = token_source.getNextToken(); + jj_ntk = -1; + jj_gen++; + return token; + } + + final public Token getToken(int index) { + Token t = lookingAhead ? jj_scanpos : token; + for (int i = 0; i < index; i++) { + if (t.next != null) t = t.next; + else t = t.next = token_source.getNextToken(); + } + return t; + } + + final private int jj_ntk() { + if ((jj_nt=token.next) == null) + return (jj_ntk = (token.next=token_source.getNextToken()).kind); + else + return (jj_ntk = jj_nt.kind); + } + + private java.util.Vector jj_expentries = new java.util.Vector(); + private int[] jj_expentry; + private int jj_kind = -1; + private int[] jj_lasttokens = new int[100]; + private int jj_endpos; + + private void jj_add_error_token(int kind, int pos) { + if (pos >= 100) return; + if (pos == jj_endpos + 1) { + jj_lasttokens[jj_endpos++] = kind; + } else if (jj_endpos != 0) { + jj_expentry = new int[jj_endpos]; + for (int i = 0; i < jj_endpos; i++) { + jj_expentry[i] = jj_lasttokens[i]; + } + boolean exists = false; + for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) { + int[] oldentry = (int[])(e.nextElement()); + if (oldentry.length == jj_expentry.length) { + exists = true; + for (int i = 0; i < jj_expentry.length; i++) { + if (oldentry[i] != jj_expentry[i]) { + exists = false; + break; + } + } + if (exists) break; + } + } + if (!exists) jj_expentries.addElement(jj_expentry); + if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] 
= kind; + } + } + + public ParseException generateParseException() { + jj_expentries.removeAllElements(); + boolean[] la1tokens = new boolean[20]; + for (int i = 0; i < 20; i++) { + la1tokens[i] = false; + } + if (jj_kind >= 0) { + la1tokens[jj_kind] = true; + jj_kind = -1; + } + for (int i = 0; i < 16; i++) { + if (jj_la1[i] == jj_gen) { + for (int j = 0; j < 32; j++) { + if ((jj_la1_0[i] & (1< jj_gen) { + jj_la = p.arg; jj_lastpos = jj_scanpos = p.first; + switch (i) { + case 0: jj_3_1(); break; + case 1: jj_3_2(); break; + case 2: jj_3_3(); break; + } + } + p = p.next; + } while (p != null); + } catch(LookaheadSuccess ls) { } + } + jj_rescan = false; + } + + final private void jj_save(int index, int xla) { + JJCalls p = jj_2_rtns[index]; + while (p.gen > jj_gen) { + if (p.next == null) { p = p.next = new JJCalls(); break; } + p = p.next; + } + p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla; + } + + static final class JJCalls { + int gen; + Token first; + int arg; + JJCalls next; + } + +} diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysis.jj src/java/org/apache/nutch/analysis/NutchAnalysis.jj --- src.orig/java/org/apache/nutch/analysis/NutchAnalysis.jj 2009-03-10 11:34:01.000000000 -0700 +++ src/java/org/apache/nutch/analysis/NutchAnalysis.jj 2009-03-10 14:11:44.000000000 -0700 @@ -84,7 +84,11 @@ queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf)); parser.queryString = queryString; parser.queryFilters = new QueryFilters(conf); - return parser.parse(conf); + try { + return parser.parse(conf); + } catch (ParseException pe) { + throw new IOException("Parse exception: " + pe); + } } /** For debugging. 
*/ @@ -112,8 +116,9 @@ TOKEN : { // token regular expressions + // basic word -- lowercase it -||)+ | )> +| ||)+ | )> { matchedToken.image = matchedToken.image.toLowerCase(); } // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed @@ -140,6 +145,8 @@ | | | +| +| | | | @@ -203,20 +210,21 @@ String field; boolean stop; boolean prohibited; + boolean required; } { nonOpOrTerm() // skip noise ( - { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; } + { stop=true; prohibited=false; required=true; field = Clause.DEFAULT_FIELD; } - // optional + or - operator - ( {stop=false;} | ( { stop=false;prohibited=true; } ))? + // optional OR, + or - + (disjunction() { query.setLastTermNotRequired(); required=false;} | {stop=false;required=true;} | ( { stop=false;prohibited=true; } ) | )* // optional field spec. ( LOOKAHEAD((phrase(field)|compound(field))) token= { field = token.image; } )? - + ( terms=phrase(field) {stop=false;} | // quoted terms or terms=compound(field)) // single or compound term @@ -233,8 +241,11 @@ } else { if (prohibited) query.addProhibitedPhrase(array, field); - else + else if (required) query.addRequiredPhrase(array, field); + else { + query.addNormalPhrase(array, field); + } } } )* @@ -243,7 +254,7 @@ } -/** Parse an explcitly quoted phrase query. Note that this may return a single +/** Parse an explicitly quoted phrase query. Note that this may return a single * term, a trivial phrase.*/ ArrayList phrase(String field) : { @@ -277,7 +288,7 @@ /** Parse a compound term that is interpreted as an implicit phrase query. * Compounds are a sequence of terms separated by infix characters. Note that - * htis may return a single term, a trivial compound. */ + * this may return a single term, a trivial compound. 
*/ ArrayList compound(String field) : { int start; @@ -348,13 +359,19 @@ | infix() } +void disjunction() : +{} +{ + +} + void nonTermOrEOF() : {} { nonTerm() | } -/** Parse anything but a term or an operator (plur or minus or quote). */ +/** Parse anything but a term or an operator (plus or minus or quote). */ void nonOpOrTerm() : {} { diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysis.jj~ src/java/org/apache/nutch/analysis/NutchAnalysis.jj~ --- src.orig/java/org/apache/nutch/analysis/NutchAnalysis.jj~ 1969-12-31 16:00:00.000000000 -0800 +++ src/java/org/apache/nutch/analysis/NutchAnalysis.jj~ 2009-03-10 12:18:08.000000000 -0700 @@ -0,0 +1,393 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** JavaCC code for the Nutch lexical analyzer. 
*/ + +options { + STATIC = false; + USER_CHAR_STREAM = true; + OPTIMIZE_TOKEN_MANAGER = true; + UNICODE_INPUT = true; +//DEBUG_TOKEN_MANAGER = true; +} + +PARSER_BEGIN(NutchAnalysis) + +package org.apache.nutch.analysis; + +import java.io.StringReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.QueryFilters; +import org.apache.nutch.searcher.Query.Clause; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.*; +import java.util.*; + +/** The JavaCC-generated Nutch lexical analyzer and query parser. */ +public class NutchAnalysis { + + private static final String[] STOP_WORDS = { + "a", "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "s", "such", + "t", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" + }; + + private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS); + + private Analyzer analyzer = null; + private String queryString; + private QueryFilters queryFilters; + + + /** Constructs a nutch analysis. */ + public NutchAnalysis(String query, Analyzer analyzer) { + this(new FastCharStream(new StringReader(query))); + this.analyzer = analyzer; + } + + /** True iff word is a stop word. Stop words are only removed from queries. + * Every word is indexed. */ + public static boolean isStopWord(String word) { + return STOP_SET.contains(word); + } + + /** Construct a query parser for the text in a reader. */ + public static Query parseQuery(String queryString, Configuration conf) throws IOException { + return parseQuery(queryString, null, conf); + } + + /** Construct a query parser for the text in a reader. 
*/ + public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf) + throws IOException { + NutchAnalysis parser = new NutchAnalysis( + queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf)); + parser.queryString = queryString; + parser.queryFilters = new QueryFilters(conf); + try { + return parser.parse(conf); + } catch (ParseException pe) { + throw new IOException("Parse exception: " + pe); + } + } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while (true) { + System.out.print("Query: "); + String line = in.readLine(); + System.out.println(parseQuery(line, NutchConfiguration.create())); + } + } + +} + +PARSER_END(NutchAnalysis) + +TOKEN_MGR_DECLS : { + + /** Constructs a token manager for the provided Reader. */ + public NutchAnalysisTokenManager(Reader reader) { + this(new FastCharStream(reader)); + } + +} + +TOKEN : { // token regular expressions + + + // basic word -- lowercase it +| ||)+ | )> + { matchedToken.image = matchedToken.image.toLowerCase(); } + + // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed +| "." 
( ".")+ > + { // remove dots + for (int i = 0; i < image.length(); i++) { + if (image.charAt(i) == '.') + image.deleteCharAt(i--); + } + matchedToken.image = image.toString().toLowerCase(); + } + + // chinese, japanese and korean characters +| > + + // irregular words +| <#IRREGULAR_WORD: (|)> +| <#C_PLUS_PLUS: ("C"|"c") "++" > +| <#C_SHARP: ("C"|"c") "#" > + + // query syntax characters +| +| +| +| +| +| +| +| +| +| + +| // treat unrecognized chars + // as whitespace +// primitive, non-token patterns + +| <#WORD_PUNCT: ("_"|"&")> // allowed anywhere in words + +| < #LETTER: // alphabets + [ + "\u0041"-"\u005a", + "\u0061"-"\u007a", + "\u00c0"-"\u00d6", + "\u00d8"-"\u00f6", + "\u00f8"-"\u00ff", + "\u0100"-"\u1fff" + ] + > + +| <#CJK: // non-alphabets + [ + "\u3040"-"\u318f", + "\u3300"-"\u337f", + "\u3400"-"\u3d2d", + "\u4e00"-"\u9fff", + "\uf900"-"\ufaff" + ] + > + +| < #DIGIT: // unicode digits + [ + "\u0030"-"\u0039", + "\u0660"-"\u0669", + "\u06f0"-"\u06f9", + "\u0966"-"\u096f", + "\u09e6"-"\u09ef", + "\u0a66"-"\u0a6f", + "\u0ae6"-"\u0aef", + "\u0b66"-"\u0b6f", + "\u0be7"-"\u0bef", + "\u0c66"-"\u0c6f", + "\u0ce6"-"\u0cef", + "\u0d66"-"\u0d6f", + "\u0e50"-"\u0e59", + "\u0ed0"-"\u0ed9", + "\u1040"-"\u1049" + ] + > + +} + + +/** Parse a query. */ +Query parse(Configuration conf) : +{ + Query query = new Query(conf); + ArrayList terms; + Token token; + String field; + boolean stop; + boolean prohibited; + boolean required; + +} +{ + nonOpOrTerm() // skip noise + ( + { stop=true; prohibited=false; required=true; field = Clause.DEFAULT_FIELD; } + + // optional + or - operator + (disjunction() {required=false;} | {stop=false;required=true;} | ( { stop=false;prohibited=true; } ))? + + // optional field spec. + ( LOOKAHEAD((phrase(field)|compound(field))) + token= { field = token.image; } )? 
+ + ( terms=phrase(field) {stop=false;} | // quoted terms or + terms=compound(field)) // single or compound term + + nonOpOrTerm() // skip noise + + { + String[] array = (String[])terms.toArray(new String[terms.size()]); + + if (stop + && field == Clause.DEFAULT_FIELD + && terms.size()==1 + && isStopWord(array[0])) { + // ignore stop words only when single, unadorned terms in default field + } else { + if (prohibited) + query.addProhibitedPhrase(array, field); + else if (required) + query.addRequiredPhrase(array, field); + else + query.addNormalPhrase(array, field); + } + } + )* + + { return query; } + +} + +/** Parse an explicitly quoted phrase query. Note that this may return a single + * term, a trivial phrase.*/ +ArrayList phrase(String field) : +{ + int start; + int end; + ArrayList result = new ArrayList(); + String term; +} +{ + + + { start = token.endColumn; } + + (nonTerm())* // skip noise + ( term = term() { result.add(term); } // parse a term + (nonTerm())*)* // skip noise + + { end = token.endColumn; } + + (|) + + { + if (this.queryFilters.isRawField(field)) { + result.clear(); + result.add(queryString.substring(start, end)); + } + return result; + } + +} + +/** Parse a compound term that is interpreted as an implicit phrase query. + * Compounds are a sequence of terms separated by infix characters. Note that + * this may return a single term, a trivial compound. 
*/ +ArrayList compound(String field) : +{ + int start; + ArrayList result = new ArrayList(); + String term; + StringBuffer terms = new StringBuffer(); +} +{ + { start = token.endColumn; } + + term = term() { + terms.append(term).append(" "); + //result.add(term); + } + ( LOOKAHEAD( (infix())+ term() ) + (infix())+ + term = term() { + terms.append(term).append(" "); + //result.add(term); + })* + + { + if (this.queryFilters.isRawField(field)) { +// result.clear(); + result.add(queryString.substring(start, token.endColumn)); + + } else { + org.apache.lucene.analysis.Token token; + TokenStream tokens = analyzer.tokenStream( + field, new StringReader(terms.toString())); + + while (true) { + try { + token = tokens.next(); + } catch (IOException e) { + token = null; + } + if (token == null) { break; } + result.add(token.termText()); + } + try { + tokens.close(); + } catch (IOException e) { + // ignore + } + } + return result; + } + +} + +/** Parse a single term. */ +String term() : +{ + Token token; +} +{ + ( token= | token= | token=) + + { return token.image; } +} + + +/** Parse anything but a term or a quote. */ +void nonTerm() : +{} +{ + | infix() +} + +void disjunction() : +{} +{ + nonOpOrTerm() +} + +void nonTermOrEOF() : +{} +{ + nonTerm() | +} + +/** Parse anything but a term or an operator (plus or minus or quote). */ +void nonOpOrTerm() : +{} +{ + (LOOKAHEAD(2) ( | nonOpInfix() | ((|) nonTermOrEOF())))* +} + +/** Characters which can be used to form compound terms. */ +void infix() : +{} +{ + | | nonOpInfix() +} + +/** Parse infix characters except plus and minus. 
*/ +void nonOpInfix() : +{} +{ + |||| +} + diff -rBNu src.orig/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java --- src.orig/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java 2009-03-10 11:34:01.000000000 -0700 +++ src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java 2009-03-10 14:11:55.000000000 -0700 @@ -24,6 +24,13 @@ { switch (pos) { + case 0: + if ((active0 & 0x2L) != 0L) + { + jjmatchedKind = 2; + return 2; + } + return -1; default : return -1; } @@ -51,25 +58,49 @@ switch(curChar) { case 34: - return jjStopAtPos(0, 9); + return jjStopAtPos(0, 10); case 39: + return jjStopAtPos(0, 17); + case 40: + return jjStopAtPos(0, 13); + case 41: return jjStopAtPos(0, 14); case 43: - return jjStopAtPos(0, 7); - case 45: return jjStopAtPos(0, 8); + case 45: + return jjStopAtPos(0, 9); case 46: - return jjStopAtPos(0, 12); + return jjStopAtPos(0, 15); case 47: - return jjStopAtPos(0, 11); + return jjStopAtPos(0, 12); case 58: - return jjStopAtPos(0, 10); + return jjStopAtPos(0, 11); case 64: - return jjStopAtPos(0, 13); + return jjStopAtPos(0, 16); + case 79: + return jjMoveStringLiteralDfa1_0(0x2L); default : return jjMoveNfa_0(1, 0); } } +private final int jjMoveStringLiteralDfa1_0(long active0) +{ + try { curChar = input_stream.readChar(); } + catch(java.io.IOException e) { + jjStopStringLiteralDfa_0(0, active0); + return 1; + } + switch(curChar) + { + case 82: + if ((active0 & 0x2L) != 0L) + return jjStartNfaWithStates_0(1, 1, 0); + break; + default : + break; + } + return jjStartNfa_0(0, active0); +} private final void jjCheckNAdd(int state) { if (jjrounds[state] != jjround) @@ -144,27 +175,33 @@ case 0: if ((0x3ff004000000000L & l) == 0L) break; - kind = 1; + kind = 2; jjCheckNAdd(0); break; case 2: - if (curChar == 46) + if ((0x3ff004000000000L & l) != 0L) + { + if (kind > 2) + kind = 2; + jjCheckNAdd(0); + } + else if (curChar == 46) jjCheckNAdd(3); break; case 
4: if (curChar != 46) break; - if (kind > 2) - kind = 2; + if (kind > 3) + kind = 3; jjCheckNAdd(3); break; case 7: if (curChar == 35) - kind = 1; + kind = 2; break; case 8: - if (curChar == 43 && kind > 1) - kind = 1; + if (curChar == 43 && kind > 2) + kind = 2; break; case 9: if (curChar == 43) @@ -184,8 +221,8 @@ case 1: if ((0x7fffffe87fffffeL & l) != 0L) { - if (kind > 1) - kind = 1; + if (kind > 2) + kind = 2; jjCheckNAdd(0); } if ((0x7fffffe07fffffeL & l) != 0L) @@ -193,11 +230,12 @@ if ((0x800000008L & l) != 0L) jjAddStates(0, 1); break; + case 2: case 0: if ((0x7fffffe87fffffeL & l) == 0L) break; - if (kind > 1) - kind = 1; + if (kind > 2) + kind = 2; jjCheckNAdd(0); break; case 3: @@ -226,23 +264,24 @@ case 1: if (jjCanMove_0(hiByte, i1, i2, l1, l2)) { - if (kind > 1) - kind = 1; + if (kind > 2) + kind = 2; jjCheckNAdd(0); } if (jjCanMove_0(hiByte, i1, i2, l1, l2)) jjstateSet[jjnewStateCnt++] = 2; if (jjCanMove_1(hiByte, i1, i2, l1, l2)) { - if (kind > 3) - kind = 3; + if (kind > 4) + kind = 4; } break; + case 2: case 0: if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; - if (kind > 1) - kind = 1; + if (kind > 2) + kind = 2; jjCheckNAdd(0); break; case 3: @@ -250,8 +289,8 @@ jjstateSet[jjnewStateCnt++] = 4; break; case 5: - if (jjCanMove_1(hiByte, i1, i2, l1, l2) && kind > 3) - kind = 3; + if (jjCanMove_1(hiByte, i1, i2, l1, l2) && kind > 4) + kind = 4; break; default : break; } @@ -304,8 +343,8 @@ } } public static final String[] jjstrLiteralImages = { -"", null, null, null, null, null, null, "\53", "\55", "\42", "\72", "\57", -"\56", "\100", "\47", null, null, null, null, null, }; +"", "\117\122", null, null, null, null, null, null, "\53", "\55", "\42", +"\72", "\57", "\50", "\51", "\56", "\100", "\47", null, null, null, null, null, }; public static final String[] lexStateNames = { "DEFAULT", }; @@ -396,9 +435,9 @@ jjmatchedKind = 0x7fffffff; jjmatchedPos = 0; curPos = jjMoveStringLiteralDfa0_0(); - if (jjmatchedPos == 0 && jjmatchedKind > 15) + if 
(jjmatchedPos == 0 && jjmatchedKind > 18) + { - jjmatchedKind = 15; + jjmatchedKind = 18; } if (jjmatchedKind != 0x7fffffff) { @@ -435,13 +474,13 @@ { switch(jjmatchedKind) { - case 1 : + case 2 : if (image == null) image = new StringBuffer(); image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1))); matchedToken.image = matchedToken.image.toLowerCase(); break; - case 2 : + case 3 : if (image == null) image = new StringBuffer(); image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1))); diff -rBNu src.orig/java/org/apache/nutch/searcher/Query.java src/java/org/apache/nutch/searcher/Query.java --- src.orig/java/org/apache/nutch/searcher/Query.java 2009-03-10 11:34:01.000000000 -0700 +++ src/java/org/apache/nutch/searcher/Query.java 2009-03-10 13:19:40.000000000 -0700 @@ -83,6 +83,8 @@ this.conf = conf; } + public void setIsRequired(boolean r) { isRequired = r; } + public boolean isRequired() { return isRequired; } public boolean isProhibited() { return isProhibited; } @@ -134,9 +136,9 @@ public String toString() { StringBuffer buffer = new StringBuffer(); -// if (isRequired) -// buffer.append("+"); -// else + if (isRequired) + buffer.append("+"); + else if (isProhibited) buffer.append ("-"); @@ -318,6 +320,23 @@ clauses.add(new Clause(new Term(term), field, true, false, this.conf)); } + /** Add a normal term in the default field. */ + public void addNormalTerm(String term) { + addNormalTerm(term, Clause.DEFAULT_FIELD); + } + + /** Add a normal term in a specified field. */ + public void addNormalTerm(String term, String field) { + clauses.add(new Clause(new Term(term), field, false, false, this.conf)); + } + + /** Sets the previous clause to not required */ + public void setLastTermNotRequired() { + if (!clauses.isEmpty()){ + ((Clause)clauses.get(clauses.size() - 1)).setIsRequired(false); + } + } + /** Add a prohibited term in the default field.
*/ public void addProhibitedTerm(String term) { addProhibitedTerm(term, Clause.DEFAULT_FIELD); @@ -343,6 +362,21 @@ } } + /** Add a normal phrase in the default field. */ + public void addNormalPhrase(String[] terms) { + addNormalPhrase(terms, Clause.DEFAULT_FIELD); + } + + /** Add a normal phrase in the specified field. */ + public void addNormalPhrase(String[] terms, String field) { + if (terms.length == 0) { // ignore empty phrase + } else if (terms.length == 1) { + addNormalTerm(terms[0], field); // optimize to term query + } else { + clauses.add(new Clause(new Phrase(terms), field, false, false, this.conf)); + } + } + /** Add a prohibited phrase in the default field. */ public void addProhibitedPhrase(String[] terms) { addProhibitedPhrase(terms, Clause.DEFAULT_FIELD); diff -rBNu src.orig/java/org/apache/nutch/searcher/Query.java~ src/java/org/apache/nutch/searcher/Query.java~ --- src.orig/java/org/apache/nutch/searcher/Query.java~ 1969-12-31 16:00:00.000000000 -0800 +++ src/java/org/apache/nutch/searcher/Query.java~ 2009-03-10 12:18:16.000000000 -0700 @@ -0,0 +1,514 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.searcher; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.ArrayList; + +// Commons Logging imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.analysis.AnalyzerFactory; + +import org.apache.nutch.analysis.NutchAnalysis; +import org.apache.nutch.util.NutchConfiguration; + +/** A Nutch query. */ +public final class Query implements Writable, Cloneable, Configurable { + public static final Log LOG = LogFactory.getLog(Query.class); + + /** A query clause. */ + public static class Clause implements Cloneable { + public static final String DEFAULT_FIELD = "DEFAULT"; + + private static final byte REQUIRED_BIT = 1; + private static final byte PROHIBITED_BIT = 2; + private static final byte PHRASE_BIT = 4; + + private boolean isRequired; + private boolean isProhibited; + private String field = DEFAULT_FIELD; + private float weight = 1.0f; + private Object termOrPhrase; + + private Configuration conf; + + public Clause(Term term, String field, + boolean isRequired, boolean isProhibited, Configuration conf) { + this(term, isRequired, isProhibited, conf); + this.field = field; + } + + public Clause(Term term, boolean isRequired, boolean isProhibited, Configuration conf) { + this.isRequired = isRequired; + this.isProhibited = isProhibited; + this.termOrPhrase = term; + this.conf = conf; + } + + public Clause(Phrase phrase, String field, + boolean isRequired, boolean isProhibited, Configuration conf) { + this(phrase, isRequired, isProhibited, conf); + this.field = field; + } + + public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, Configuration conf) { + this.isRequired = isRequired; + 
this.isProhibited = isProhibited; + this.termOrPhrase = phrase; + this.conf = conf; + } + + public boolean isRequired() { return isRequired; } + public boolean isProhibited() { return isProhibited; } + + public String getField() { return field; } + + public float getWeight() { return weight; } + public void setWeight(float weight) { this.weight = weight; } + + public boolean isPhrase() { return termOrPhrase instanceof Phrase; } + + public Phrase getPhrase() { return (Phrase)termOrPhrase; } + public Term getTerm() { return (Term)termOrPhrase; } + + public void write(DataOutput out) throws IOException { + byte bits = 0; + if (isPhrase()) + bits |= PHRASE_BIT; + if (isRequired) + bits |= REQUIRED_BIT; + if (isProhibited) + bits |= PROHIBITED_BIT; + out.writeByte(bits); + out.writeUTF(field); + out.writeFloat(weight); + + if (isPhrase()) + getPhrase().write(out); + else + getTerm().write(out); + } + + public static Clause read(DataInput in, Configuration conf) throws IOException { + byte bits = in.readByte(); + boolean required = ((bits & REQUIRED_BIT) != 0); + boolean prohibited = ((bits & PROHIBITED_BIT) != 0); + + String field = in.readUTF(); + float weight = in.readFloat(); + + Clause clause; + if ((bits & PHRASE_BIT) == 0) { + clause = new Clause(Term.read(in), field, required, prohibited, conf); + } else { + clause = new Clause(Phrase.read(in), field, required, prohibited, conf); + } + clause.weight = weight; + return clause; + } + + public String toString() { + StringBuffer buffer = new StringBuffer(); + if (isRequired) + buffer.append("+"); + else + if (isProhibited) + buffer.append ("-"); + + if (!DEFAULT_FIELD.equals(field)) { + buffer.append(field); + buffer.append(":"); + } + + if (!isPhrase() && new QueryFilters(conf).isRawField(field)) { + buffer.append('"'); // quote raw terms + buffer.append(termOrPhrase.toString()); + buffer.append('"'); + } else { + buffer.append(termOrPhrase.toString()); + } + + return buffer.toString(); + } + + public boolean 
equals(Object o) { + if (!(o instanceof Clause)) return false; + Clause other = (Clause)o; + return + (this.isRequired == other.isRequired) && + (this.isProhibited == other.isProhibited) && + (this.weight == other.weight) && + (this.termOrPhrase == null ? other.termOrPhrase == null : + this.termOrPhrase.equals(other.termOrPhrase)); + } + + public int hashCode() { + return + (this.isRequired ? 0 : 1) ^ + (this.isProhibited ? 2 : 4) ^ + Float.floatToIntBits(this.weight) ^ + (this.termOrPhrase != null ? termOrPhrase.hashCode() : 0); + } + + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + } + + /** A single-term query clause. */ + public static class Term { + private String text; + + public Term(String text) { + this.text = text; + } + + public void write(DataOutput out) throws IOException { + out.writeUTF(text); + } + + public static Term read(DataInput in) throws IOException { + String text = in.readUTF(); + return new Term(text); + } + + public String toString() { + return text; + } + + public boolean equals(Object o) { + if (!(o instanceof Term)) return false; + Term other = (Term)o; + return text == null ? other.text == null : text.equals(other.text); + } + + public int hashCode() { + return text != null ? text.hashCode() : 0; + } + } + + /** A phrase query clause. 
*/ + public static class Phrase { + private Term[] terms; + + public Phrase(Term[] terms) { + this.terms = terms; + } + + public Phrase(String[] terms) { + this.terms = new Term[terms.length]; + for (int i = 0; i < terms.length; i++) { + this.terms[i] = new Term(terms[i]); + } + } + + public Term[] getTerms() { return terms; } + + public void write(DataOutput out) throws IOException { + out.writeByte(terms.length); + for (int i = 0; i < terms.length; i++) + terms[i].write(out); + } + + public static Phrase read(DataInput in) throws IOException { + int length = in.readByte(); + Term[] terms = new Term[length]; + for (int i = 0; i < length; i++) + terms[i] = Term.read(in); + return new Phrase(terms); + } + + public String toString() { + StringBuffer buffer = new StringBuffer(); + buffer.append("\""); + for (int i = 0; i < terms.length; i++) { + buffer.append(terms[i].toString()); + if (i != terms.length-1) + buffer.append(" "); + } + buffer.append("\""); + return buffer.toString(); + } + + public boolean equals(Object o) { + if (!(o instanceof Phrase)) return false; + Phrase other = (Phrase)o; + if (!(this.terms.length == this.terms.length)) + return false; + for (int i = 0; i < terms.length; i++) { + if (!this.terms[i].equals(other.terms[i])) + return false; + } + return true; + } + + public int hashCode() { + int hashCode = terms.length; + for (int i = 0; i < terms.length; i++) { + hashCode ^= terms[i].hashCode(); + } + return hashCode; + } + + } + + + private ArrayList clauses = new ArrayList(); + + private Configuration conf; + + private static final Clause[] CLAUSES_PROTO = new Clause[0]; + + public Query() { + } + + public Query(Configuration conf) { + this.conf = conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return conf; + } + + /** Return all clauses. 
*/ + public Clause[] getClauses() { + return (Clause[])clauses.toArray(CLAUSES_PROTO); + } + + /** Add a required term in the default field. */ + public void addRequiredTerm(String term) { + addRequiredTerm(term, Clause.DEFAULT_FIELD); + } + + /** Add a required term in a specified field. */ + public void addRequiredTerm(String term, String field) { + clauses.add(new Clause(new Term(term), field, true, false, this.conf)); + } + + /** Add a normal term in the default field. */ + public void addNormalTerm(String term) { + addNormalTerm(term, Clause.DEFAULT_FIELD); + } + + /** Add a normal term in a specified field. */ + public void addNormalTerm(String term, String field) { + clauses.add(new Clause(new Term(term), field, false, false, this.conf)); + } + + /** Add a prohibited term in the default field. */ + public void addProhibitedTerm(String term) { + addProhibitedTerm(term, Clause.DEFAULT_FIELD); + } + + /** Add a prohibited term in the specified field. */ + public void addProhibitedTerm(String term, String field) { + clauses.add(new Clause(new Term(term), field, false, true, this.conf)); + } + + /** Add a required phrase in the default field. */ + public void addRequiredPhrase(String[] terms) { + addRequiredPhrase(terms, Clause.DEFAULT_FIELD); + } + + /** Add a required phrase in the specified field. */ + public void addRequiredPhrase(String[] terms, String field) { + if (terms.length == 0) { // ignore empty phrase + } else if (terms.length == 1) { + addRequiredTerm(terms[0], field); // optimize to term query + } else { + clauses.add(new Clause(new Phrase(terms), field, true, false, this.conf)); + } + } + + /** Add a normal phrase in the default field. */ + public void addNormalPhrase(String[] terms) { + addNormalPhrase(terms, Clause.DEFAULT_FIELD); + } + + /** Add a normal phrase in the specified field.
*/ + public void addNormalPhrase(String[] terms, String field) { + if (terms.length == 0) { // ignore empty phrase + } else if (terms.length == 1) { + addNormalTerm(terms[0], field); // optimize to term query + } else { + clauses.add(new Clause(new Phrase(terms), field, false, false, this.conf)); + } + } + + /** Add a prohibited phrase in the default field. */ + public void addProhibitedPhrase(String[] terms) { + addProhibitedPhrase(terms, Clause.DEFAULT_FIELD); + } + + /** Add a prohibited phrase in the specified field. */ + public void addProhibitedPhrase(String[] terms, String field) { + if (terms.length == 0) { // ignore empty phrase + } else if (terms.length == 1) { + addProhibitedTerm(terms[0], field); // optimize to term query + } else { + clauses.add(new Clause(new Phrase(terms), field, false, true, this.conf)); + } + } + + public void write(DataOutput out) throws IOException { + out.writeByte(clauses.size()); + for (int i = 0; i < clauses.size(); i++) + ((Clause)clauses.get(i)).write(out); + } + + public static Query read(DataInput in, Configuration conf) throws IOException { + Query result = new Query(conf); + result.readFields(in); + return result; + } + + public void readFields(DataInput in) throws IOException { + clauses.clear(); + int length = in.readByte(); + for (int i = 0; i < length; i++) + clauses.add(Clause.read(in, this.conf)); + } + + public String toString() { + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < clauses.size(); i++) { + buffer.append(clauses.get(i).toString()); + if (i != clauses.size()-1) + buffer.append(" "); + } + return buffer.toString(); + } + + public boolean equals(Object o) { + if (!(o instanceof Query)) return false; + Query other = (Query)o; + return this.clauses.equals(other.clauses); + } + + public int hashCode() { + return this.clauses.hashCode(); + } + + public Object clone() { + Query clone = null; + try { + clone = (Query)super.clone(); + } catch (CloneNotSupportedException e) { + throw new 
RuntimeException(e); + } + clone.clauses = (ArrayList)clauses.clone(); + return clone; + } + + + /** Flattens a query into the set of text terms that it contains. These are + * terms which should be highlighted in matching documents. */ + public String[] getTerms() { + ArrayList result = new ArrayList(); + for (int i = 0; i < clauses.size(); i++) { + Clause clause = (Clause)clauses.get(i); + if (!clause.isProhibited()) { + if (clause.isPhrase()) { + Term[] terms = clause.getPhrase().getTerms(); + for (int j = 0; j < terms.length; j++) { + result.add(terms[j].toString()); + } + } else { + result.add(clause.getTerm().toString()); + } + } + } + return (String[])result.toArray(new String[result.size()]); + } + + /** + * Parse a query from a string using a language specific analyzer. + * + * @param queryString is the raw query string to parse + * @param queryLang is a two-letter language code used to identify which + * {@link org.apache.nutch.analysis.NutchAnalyzer} should be used + * to parse the query string. + * @see org.apache.nutch.analysis.AnalyzerFactory + */ + public static Query parse(String queryString, String queryLang, Configuration conf) + throws IOException { + return fixup(NutchAnalysis.parseQuery( + queryString, AnalyzerFactory.get(conf).get(queryLang), conf), conf); + } + + /** Parse a query from a string. */ + public static Query parse(String queryString, Configuration conf) throws IOException { + return parse(queryString, null, conf); + } + + /** Convert clauses in unknown fields to the default field.
*/ + private static Query fixup(Query input, Configuration conf) { + // walk the query + Query output = new Query(conf); + Clause[] clauses = input.getClauses(); + for (int i = 0; i < clauses.length; i++) { + Clause c = clauses[i]; + if (!new QueryFilters(conf).isField(c.getField())) { // unknown field + ArrayList terms = new ArrayList(); // add name to query + if (c.isPhrase()) { + terms.addAll(Arrays.asList(c.getPhrase().getTerms())); + } else { + terms.add(c.getTerm()); + } + terms.add(0, new Term(c.getField())); // add to front of phrase + c = (Clause)c.clone(); + c.field = Clause.DEFAULT_FIELD; // use default field instead + c.termOrPhrase + = new Phrase((Term[])terms.toArray(new Term[terms.size()])); + } + output.clauses.add(c); // copy clause to output + } + return output; + } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + Configuration conf = NutchConfiguration.create(); + while (true) { + System.out.print("Query: "); + String line = in.readLine(); + Query query = parse(line, conf); + System.out.println("Parsed: " + query); + System.out.println("Translated: " + new QueryFilters(conf).filter(query)); + } + } +}