基本思想
逆向最大匹配(RMM)从待处理文本的最末端开始匹配扫描:每次取最末端的 i 个字符作为候选词,其中 i 是词典中最长词的字数;若候选词不在词典中,则去掉它最前面的一个字,继续匹配;匹配成功后,将该词切分出来,再对剩余的文本重复上述过程。
python实现
RMM类
"""
@author: Alex
@contact: 1272296763@qq.com or jakinmili@gmail.com
@file: Re_max_Match.py
@time: 2019/9/14 20:52
"""
class RMM:
    """Reverse maximum matching (RMM) tokenizer for dictionary-based segmentation.

    The text is scanned from its end towards its start. At each position the
    longest candidate (at most ``window_size`` characters) that ends at the
    current position is tried first; if it is not in the dictionary, its first
    character is dropped and the shorter candidate is tried, and so on.
    """

    # Demo vocabulary used when the caller does not supply a dictionary.
    _DEFAULT_WORDS = ("研究", "研究生", "生命", "命", "的", "起源")

    def __init__(self, window_size=3, dictionary=None):
        """Create a tokenizer.

        Args:
            window_size: Maximum candidate length in characters. It should be
                the length of the longest word in the dictionary.
            dictionary: Optional iterable of words. When ``None``, the small
                built-in demo vocabulary is used.
        """
        self.window_size = window_size
        # Running count of single-character tokens emitted by ``cut``; it is
        # never reset, so it accumulates across calls on the same instance.
        self._simple_num = 0
        # A frozenset gives O(1) membership tests and is built only once.
        self._words = frozenset(
            self._DEFAULT_WORDS if dictionary is None else dictionary
        )

    def cut(self, text):
        """Segment ``text`` and return the tokens in reading order.

        Characters that cannot be matched against the dictionary, even as a
        single-character word, are skipped and do not appear in the result.

        Args:
            text: The string to segment.

        Returns:
            A list of dictionary words found in ``text``, left to right.
        """
        result = []
        index = len(text)  # exclusive end of the not-yet-segmented prefix
        while index > 0:
            # Clamp at 0: a negative slice start would wrap around to the end
            # of the string and yield a wrong candidate near the text start.
            start = max(0, index - self.window_size)
            for size in range(start, index):
                piece = text[size:index]
                if piece in self._words:
                    if index - size == 1:
                        self._simple_num += 1
                    result.append(piece)
                    index = size  # continue to the left of the matched word
                    break
            else:
                # No candidate matched: skip one character and keep scanning.
                index -= 1
        # Tokens were collected right-to-left; restore reading order.
        result.reverse()
        return result
运行:
# Demo: segment a sample sentence with the default tokenizer settings.
tokenizer = RMM()
text = '研究生命的起源'
result = tokenizer.cut(text)
# Single-character token count accumulated by the tokenizer so far.
num = tokenizer._simple_num
# Print the token list, then the single-character count, one per line.
print(result, num, sep='\n')