There is a very simple hack to incorporate word order in an existing bag-of-words model implementation. Treat some of the phrases, such as the frequently occurring bi-grams (e.g. "New York"), as a unit, i.e. as a single word instead of treating them as separate entities. This will ensure that "New York" is treated differently from "York New". You could also define higher-order word shingles (n-grams with n = 3, 4, etc.).
You could use the Lucene ShingleFilter to decompose your document text into shingles as a pre-processing step and then apply the classifier on this decomposed text.
import java.io.*;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.util.*;
import org.apache.lucene.analysis.util.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.charfilter.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
class TestAnalyzer extends Analyzer {
TestAnalyzer() {
super();
}
protected TokenStreamComponents createComponents( String fieldName, Reader reader ) {
String token;
TokenStream result = null;
Tokenizer source = new WhitespaceTokenizer( Version.LUCENE_CURRENT, reader );
result = new ShingleFilter(source, 2, 2);
return new TokenStreamComponents( source, result );
}
}
/**
 * Demo driver: runs {@link TestAnalyzer} over a sample sentence and prints
 * every emitted token (unigrams and bi-gram shingles), one per line.
 */
public class LuceneTest {
    public static void main(String[] args) throws Exception {
        TestAnalyzer analyzer = new TestAnalyzer();
        // try-with-resources guarantees the TokenStream is closed even if
        // reset()/incrementToken() throws mid-iteration; the original only
        // closed it on the success path, leaking it on error.
        try (TokenStream stream =
                analyzer.tokenStream("field", new StringReader("This is a sample sentence."))) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            // print all tokens until stream is exhausted
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            // end() must be called after the last incrementToken() so
            // end-of-stream attributes (e.g. final offset) are set.
            stream.end();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}