南昌大学|中正论坛

 找回密码
 注册[30秒完成]
搜索
查看: 6108|回复: 0

关于C#在lucene.net下的中文切词

[复制链接]
发表于 2008-9-9 01:25:51 | 显示全部楼层 |阅读模式
经过一天的研究,终于完成了C#在lucene.net下可以使用的中文切词方法。感到有些复杂,不过我还是拿下了。颇有点成就感的,发上来跟大家分享一下!   在实现了中文切词的基础方法上,我将其封装在继承lucene的Analyzer类下
  ChineseAnalyzer的方法就不用多说了。
以下是引用片段:
  using System;
  using System.Collections.Generic;
  using System.Text;
  
  using Lucene.Net.Analysis;
  using Lucene.Net.Analysis.Standard;
  
  namespace Lucene.Fanswo
  {
   /**////  
   ///
   ///  
   /// <summary>
   /// An <see cref="Analyzer"/> for Chinese text: tokenizes with
   /// <see cref="ChineseTokenizer"/>, then applies the standard filter,
   /// lower-casing, and stop-word removal.
   /// </summary>
   public class ChineseAnalyzer : Analyzer
   {
       /// <summary>
       /// Words removed from the token stream: common English function words
       /// plus the Chinese pronouns "我" and "我们".
       /// </summary>
       public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们" };

       /// <summary>
       /// Builds the analysis chain for one field:
       /// ChineseTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter.
       /// </summary>
       /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
       /// <param name="reader">Source of the text to tokenize.</param>
       public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
       {
           TokenStream stream = new ChineseTokenizer(reader);
           stream = new StandardFilter(stream);
           stream = new LowerCaseFilter(stream);
           return new StopFilter(stream, CHINESE_ENGLISH_STOP_WORDS);
       }
   }
  }
  
  ChineseTokenizer类的实现:
  这里通过词典来正向匹配字符,返回lucene下定义的token流

以下是引用片段:
  using System;
  using System.Collections.Generic;
  using System.Text;
  using Lucene.Net.Analysis;
  using System.Collections;
  using System.Text.RegularExpressions;
  using System.IO;
  
  namespace Lucene.Fanswo
  {
   /// <summary>
   /// Forward-maximum-match Chinese tokenizer. Dictionary words are matched
   /// greedily against a trie of nested Hashtables (<see cref="WordTree"/>);
   /// runs of English letters or digits each become a single token; any other
   /// character becomes a one-character token. Whitespace is skipped.
   /// </summary>
   class ChineseTokenizer : Tokenizer
   {
       // offset: position where the NEXT token scan begins.
       // bufferIndex: start offset of the token currently being built.
       // dataLen: total length of the input text.
       private int offset = 0, bufferIndex = 0, dataLen = 0;

       // Scan cursor within the current call to Next().
       private int start;

       /// <summary>The complete input text, read eagerly in the constructor.</summary>
       private string text;

       /// <summary>
       /// Time spent segmenting. NOTE(review): never written inside this
       /// class — confirm which caller (if any) updates it.
       /// </summary>
       public double TextSeg_Span = 0;

       /// <summary>
       /// Constructs a tokenizer over <paramref name="reader"/>; the whole
       /// input is read into memory up front.
       /// </summary>
       public ChineseTokenizer(System.IO.TextReader reader)
       {
           this.input = reader;
           text = input.ReadToEnd();
           dataLen = text.Length;
       }

       /// <summary>
       /// Returns the next token in the stream, or null once the input is
       /// exhausted.
       /// </summary>
       public override Token Next()
       {
           // NOTE(review): the dictionary is reloaded on EVERY call; unless
           // WordTree.LoadDict caches internally this dominates runtime —
           // consider loading once in the constructor instead.
           WordTree tree = new WordTree();
           tree.LoadDict();
           // Current node of the trie-shaped dictionary (Hashtable of Hashtables).
           Hashtable node = WordTree.chartable;
           string word = "";
           string ch;
           start = offset;
           bufferIndex = start;

           while (true)
           {
               // Past the end of the text: no more tokens.
               if (start >= dataLen)
               {
                   break;
               }

               ch = text.Substring(start, 1);
               // Skip whitespace between tokens.
               if (string.IsNullOrEmpty(ch.Trim()))
               {
                   start++;
                   // BUGFIX: keep the token's start offset in sync while no
                   // token has begun, so Token offsets point at the token
                   // text rather than at preceding whitespace.
                   if (word.Length == 0)
                   {
                       bufferIndex = start;
                   }
                   continue;
               }

               // Character does not continue a dictionary word.
               if (!node.Contains(ch))
               {
                   if (word == "")
                   {
                       int j = start + 1;
                       switch (tree.GetCharType(ch))
                       {
                           case 0: // single Chinese character not in the dictionary
                               word += ch;
                               break;

                           case 1: // run of English letters
                               while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 1)
                               {
                                   j++;
                               }
                               // BUGFIX: the run length must be measured from
                               // 'start', not 'offset'; they differ once leading
                               // whitespace has been skipped, and 'j - offset'
                               // produced an over-long (or out-of-range) substring.
                               word += text.Substring(start, j - start);
                               break;

                           case 2: // run of digits
                               while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 2)
                               {
                                   j++;
                               }
                               word += text.Substring(start, j - start); // BUGFIX: was 'j - offset'
                               break;

                           default: // any other character: emit as-is
                               word += ch;
                               break;
                       }

                       offset = j; // next call resumes after this token
                   }
                   else
                   {
                       offset = start; // dictionary match ended; resume at the char that broke it
                   }

                   return new Token(word, bufferIndex, bufferIndex + word.Length - 1);
               }

               // Character continues a dictionary word: accumulate it and
               // descend one level of the trie.
               word += ch;
               // NOTE(review): assumes every trie value is itself a Hashtable;
               // a null leaf would make the next Contains call throw — confirm
               // against the structure WordTree.LoadDict builds.
               node = (Hashtable)node[ch];
               start++;
               if (start == dataLen)
               {
                   offset = dataLen;
                   return new Token(word, bufferIndex, bufferIndex + word.Length - 1);
               }
           }
           return null;
       }
   }
  }

  测试的代码:
以下是引用片段:
  using System;
  using System.Collections.Generic;
  using System.Text;
  
  using Analyzer = Lucene.Net.Analysis.Analyzer;
  using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
  using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
  using Token = Lucene.Net.Analysis.Token;
  using TokenStream = Lucene.Net.Analysis.TokenStream;
  
  namespace MyLuceneTest
  {
   class Program
   {
       /// <summary>
       /// Entry point: tokenizes a fixed Chinese sample sentence and prints
       /// the tokens plus timing statistics.
       /// </summary>
       [STAThread]
       public static void Main(System.String[] args)
       {
           try
           {
               Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
           }
           catch (System.Exception e)
           {
               System.Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message + e.ToString());
           }
       }

       /// <summary>Tokenizes <paramref name="text"/> and prints each token plus throughput stats.</summary>
       internal static void Test(System.String text, bool verbose)
       {
           System.Console.Out.WriteLine(" Tokenizing string: " + text);
           Test(new System.IO.StringReader(text), verbose, text.Length);
       }

       /// <summary>
       /// Drains the analyzer's token stream from <paramref name="reader"/>,
       /// optionally printing every token, then reports elapsed time and
       /// throughput. <paramref name="bytes"/> is the input length used for
       /// the megabytes/hour figure.
       /// </summary>
       internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
       {
           //Analyzer analyzer = new StandardAnalyzer();
           Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
           TokenStream stream = analyzer.TokenStream(null, reader);

           System.DateTime start = System.DateTime.Now;

           int count = 0;
           for (Token t = stream.Next(); t != null; t = stream.Next())
           {
               if (verbose)
               {
                   System.Console.Out.WriteLine("Token=" + t.ToString());
               }
               count++;
           }

           System.DateTime end = System.DateTime.Now;

           // BUGFIX: the original printed raw Ticks (100-nanosecond units) but
           // labelled them "milliseconds", inflating every figure by 10,000x.
           // Convert the elapsed TimeSpan to real milliseconds instead.
           double time = (end - start).TotalMilliseconds;
           System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
           // Guard the per-token and throughput figures against division by
           // zero (empty input, or elapsed time below timer resolution).
           if (count > 0)
           {
               System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
           }
           if (time > 0)
           {
               System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
           }
       }
   }
  }
  
  测试结果:
  
  完毕!
  分词的效率上还有待在算法上提高。还有中文的标点符号没有处理,我将进一步完善。
  本人文采不好,写不出很多文字,只有以代码代替一下我的言语。兄弟姐妹们给点意见哦。谢谢!
您需要登录后才可以回帖 登录 | 注册[30秒完成]

本版积分规则

手机访问本页请
扫描左边二维码
         本网站声明
本网站所有内容为网友上传,若存在版权问题或是相关责任请联系站长!
站长联系QQ:7123767   myubbs.com
         站长微信:7123767
请扫描右边二维码
www.myubbs.com

小黑屋|手机版|Archiver|南昌大学论坛 ( 琼ICP备10001196号-2 )

GMT+8, 2024-3-28 17:54 , Processed in 1.185120 second(s), 16 queries .

Powered by 高考信息网 X3.3

© 2001-2013 大学排名

快速回复 返回顶部 返回列表