600字范文,内容丰富有趣,生活中的好帮手!
600字范文 > php自定义词库简单分词 敏感词替换

php自定义词库简单分词 敏感词替换

时间:2021-08-14 13:36:35

相关推荐

php自定义词库简单分词 敏感词替换

关键词匹配类

<?php

namespace App\Library;

use App\Library\Redis;

/**
 * Dictionary-based keyword / sensitive-word matcher.
 *
 * Words are split into UTF-8 characters and stored in a nested-array trie
 * ($dict). A node that terminates a complete word carries the key 'end'.
 * The trie is serialised as JSON and cached in Redis under $key.
 */
class SensitiveWordFilter
{
    /**
     * In-memory trie: character => child node; 'end' => 1 marks a complete word.
     *
     * @var array
     */
    protected $dict;

    /**
     * Redis key under which the JSON-encoded trie is stored.
     *
     * @var string
     */
    protected $key;

    /**
     * @param string $key Redis key used to store / read the dictionary
     */
    public function __construct($key)
    {
        $this->dict = [];
        $this->key = $key;
    }

    /**
     * Add every word of $data to the in-memory trie and store the trie in Redis.
     *
     * Words are added on top of whatever the in-memory trie already holds.
     *
     * @param array|\Traversable $data words to add; blank and non-scalar entries are skipped
     * @param int                $time lifetime of the Redis entry in seconds
     *
     * @throws \RuntimeException when the trie cannot be JSON-encoded
     */
    public function loadData($data, $time = 7200)
    {
        // Building a large dictionary is memory- and time-hungry. Note that
        // both calls change settings for the whole running script.
        ini_set("memory_limit", "2048M");
        set_time_limit(0);

        foreach ($data as $v) {
            if (!is_scalar($v)) {
                continue;
            }

            // Trim BEFORE the emptiness test so whitespace-only entries are
            // skipped, and compare against '' so the word "0" is not dropped.
            $word = trim((string) $v);
            if ($word === '') {
                continue;
            }

            $this->addWords($word);
        }

        $json = json_encode($this->dict, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Storing a broken value would silently disable matching until it expires.
            throw new \RuntimeException('Unable to JSON-encode the dictionary, json error code ' . json_last_error());
        }

        // SETEX overwrites any existing value, so no prior delete is required.
        Redis::connection()->setex($this->key, $time, $json);
    }

    /**
     * Check whether the dictionary key currently exists in Redis.
     *
     * @return mixed the raw result of the Redis "exists" call
     */
    public function checkLKey()
    {
        return Redis::connection()->exists($this->key);
    }

    /**
     * Split text into individual UTF-8 characters (not bytes).
     *
     * @param string $str
     *
     * @return string[] empty array when $str is empty or not valid UTF-8
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", (string) $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the subject is not valid UTF-8.
        return $chars === false ? [] : $chars;
    }

    /**
     * Insert one word into the trie.
     *
     * @param string $words the word to insert; an empty word is ignored
     */
    protected function addWords($words)
    {
        $wordArr = $this->splitStr($words);
        if (empty($wordArr)) {
            // Never flag the root node itself as a complete word.
            return;
        }

        if (!is_array($this->dict)) {
            $this->dict = [];
        }

        $curNode = &$this->dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char])) {
                $curNode[$char] = [];
            }
            $curNode = &$curNode[$char];
        }

        // Mark the path leading to this node as a complete word.
        $curNode['end'] = 1;
        unset($curNode);
    }

    /**
     * Scan text for dictionary words.
     *
     * The dictionary is (re)loaded from Redis on every call. At each start
     * position the longest complete dictionary word is taken, and scanning
     * resumes after its last character.
     *
     * @param string $str          text to scan
     * @param string $replace      replacement for each matched character (only visible when $returnText is true)
     * @param int    $skipDistance total number of non-matching characters that may be skipped inside one match
     * @param bool   $returnText   false (default): return the matched words; true: return the text with matches replaced
     *
     * @return string[]|string list of matched words, or the replaced text when $returnText is true
     */
    public function filter($str, $replace = '*', $skipDistance = 0, $returnText = false)
    {
        $finalRes = [];

        // A missing or corrupt Redis entry yields an empty dictionary instead of null.
        $stored = json_decode((string) Redis::connection()->get($this->key), true);
        $this->dict = is_array($stored) ? $stored : [];

        $maxDistance = max((int) $skipDistance, 0) + 1;
        $strArr = $this->splitStr($str);
        $length = count($strArr);

        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];
            if (!isset($this->dict[$char])) {
                continue;
            }

            // Plain assignment is enough: the trie is only read here.
            $curNode = $this->dict[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Indexes of the longest complete word found from position $i so far.
            $lastMatch = isset($curNode['end']) ? $matchIndex : null;

            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                $next = $strArr[$j];
                if (!isset($curNode[$next])) {
                    $dist++;
                    continue;
                }

                $matchIndex[] = $j;
                $curNode = $curNode[$next];
                if (isset($curNode['end'])) {
                    $lastMatch = $matchIndex;
                }
            }

            if ($lastMatch === null) {
                continue;
            }

            $word = '';
            foreach ($lastMatch as $index) {
                $word .= $strArr[$index];
                $strArr[$index] = $replace;
            }
            $finalRes[] = $word;

            // Resume after the last matched character.
            $i = end($lastMatch);
        }

        return $returnText ? implode('', $strArr) : $finalRes;
    }

    /**
     * Tell whether the given text is exactly one complete word of the
     * in-memory dictionary.
     *
     * Only the in-memory trie is consulted; it is populated by loadData()
     * or by a previous filter() call.
     *
     * @param string|string[] $strArr the text, or the text already split into characters
     *
     * @return bool|mixed the node's 'end' marker (1) on a match, false otherwise
     */
    public function isMatch($strArr)
    {
        $strArr = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        if (empty($strArr) || !is_array($this->dict)) {
            return false;
        }

        $curNode = $this->dict;
        foreach ($strArr as $char) {
            if (!isset($curNode[$char])) {
                return false;
            }
            // Descend into the child node so the next character is checked
            // against this branch rather than against the root.
            $curNode = $curNode[$char];
        }

        return isset($curNode['end']) ? $curNode['end'] : false;
    }
}

调用示例

$wordFilter = new SensitiveWordFilter('keywords_dict');

// Rebuild the dictionary when it is not cached in Redis.
if (!$wordFilter->checkLKey()) {
    // Convert to a plain array first: empty() on a collection object is
    // always false, so the emptiness check must run on the array.
    $keywordData = Keyword::query()->pluck('keyword')->toArray();
    if (!empty($keywordData)) {
        $wordFilter->loadData($keywordData);
    }
    unset($keywordData);
}

// Run the match. The literal must use straight quotes; typographic quotes
// are not string delimiters in PHP.
$keywords = $wordFilter->filter('努力读书,报效祖国');
var_dump($keywords); // an array such as ["读书", "祖国"], provided both words are in the dictionary

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。