jieba.go 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. package tokenizer
  2. import (
  3. "errors"
  4. "github.com/blevesearch/bleve/analysis"
  5. "github.com/blevesearch/bleve/registry"
  6. "github.com/yanyiwu/gojieba"
  7. )
  8. type JiebaTokenizer struct {
  9. handle *gojieba.Jieba
  10. }
  11. func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
  12. x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
  13. return &JiebaTokenizer{x}
  14. }
  15. func (x *JiebaTokenizer) Free() {
  16. x.handle.Free()
  17. }
  18. func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
  19. result := make(analysis.TokenStream, 0)
  20. pos := 1
  21. words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, false)
  22. for _, word := range words {
  23. token := analysis.Token{
  24. Term: []byte(word.Str),
  25. Start: word.Start,
  26. End: word.End,
  27. Position: pos,
  28. Type: analysis.Ideographic,
  29. }
  30. result = append(result, &token)
  31. pos++
  32. }
  33. return result
  34. }
  35. func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
  36. dictpath, ok := config["dictpath"].(string)
  37. if !ok {
  38. return nil, errors.New("config dictpath not found")
  39. }
  40. hmmpath, ok := config["hmmpath"].(string)
  41. if !ok {
  42. return nil, errors.New("config hmmpath not found")
  43. }
  44. userdictpath, ok := config["userdictpath"].(string)
  45. if !ok {
  46. return nil, errors.New("config userdictpath not found")
  47. }
  48. idf, ok := config["idf"].(string)
  49. if !ok {
  50. return nil, errors.New("config idf not found")
  51. }
  52. stop_words, ok := config["stop_words"].(string)
  53. if !ok {
  54. return nil, errors.New("config stop_words not found")
  55. }
  56. return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
  57. }
  58. func init() {
  59. registry.RegisterTokenizer("gojieba", tokenizerConstructor)
  60. }