Viewing the tokens produced by an analyzer
class="java" name="code">import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.util.Version; import org.junit.Test; public class AnalyzerTest { @Test public void analyzer() throws IOException { String text = "小笑话_总统的房间 Room .txt"; Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); TokenStream tokenStream = analyzer.tokenStream("name", text); OffsetAttribute attribute = tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { System.out.println("token: " + tokenStream.reflectAsString(true)); System.out.println("token start offset: " + attribute.startOffset()); System.out.println("token end offset: " + attribute.endOffset()); } tokenStream.end(); tokenStream.close(); } /** * 测试分词器输出 * WhitespaceAnalyzer 以空格作为切词标准,不对语汇单元进行其他规范化处理 SimpleAnalyzer 以非字母符来分割文本信息,并将语汇单元统一为小写形式,并去掉数字类型的字符 StopAnalyzer 该分析器会去除一些常有a,the,an等等,也可以自定义禁用词 StandardAnalyzer Lucene内置的标准分析器,会将语汇单元转成小写形式,并去除停用词及标点符号 CJKAnalyzer 能对中,日,韩语言进行分析的分词器,对中文支持效果一般。 SmartChineseAnalyzer 对中文支持稍好,但扩展性差 * @throws IOException */ @Test public void testCharTermAttribute () throws IOException { String text = "小笑话_总统的房间 Room .txt"; //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); //Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_4_9); Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_4_9); TokenStream tokenStream = analyzer.tokenStream("name", text); CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { System.out.println(termAtt.toString()); } tokenStream.end(); tokenStream.close(); } }