如何设计高效的聊天过滤词算法?
发布日期:2021-11-07 23:21:03 浏览次数:8 分类:技术文章

本文共 5395 字,大约阅读时间需要 17 分钟。

关于聊天过滤词算法,一直困扰着我,了解到很多算法,比如:KMP, 正则循环匹配等,然后在看到了一篇文章,现摘要几种相对好的答案,以备不时之需。

1trie树算法

我们的解决方法是用构造一个tire树。 每个节点都存储0- 256个字符。

用脏词字典来构造这个树。

树的结构大概如下

具体实现代码如下:

namespace KGame{class WordFilter{public:    WordFilter() {}    ~WordFilter()     {        Clean(&m_Filter);    }    void AddWord(const char* word)    {        UInt32 len = (UInt32)strlen(word);        Filter* filter = &m_Filter;        for (UInt32 i = 0; i < len; i++)        {            unsigned char c = word[i];            if (i == len - 1)            {                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_IS_END;                break;            }            else            {                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_HAS_NEXT;            }            if (filter->m_NodeArray[c].m_NextFilter == NULL)            {                Filter* tmpFilter = XNEW (Filter)();                filter->m_NodeArray[c].m_NextFilter = tmpFilter;            }            filter = (Filter *)filter->m_NodeArray[c].m_NextFilter;        }    }    void AddWords(const std::set
& wordList) { for (std::set
::const_iterator it = wordList.begin(); it != wordList.end(); it++) { AddWord(it->c_str()); } } void AddWords(const std::vector
& wordList) { for (std::vector
::const_iterator it = wordList.begin(); it != wordList.end(); it++) { AddWord(it->c_str()); } } void AddWords(const KGame::Set
& worldList) { for (KGame::Set
::Iter* iter = worldList.Begin(); iter != worldList.End(); iter = worldList.Next(iter)) { AddWord(iter->m_Value.c_str()); } } Int32 Check(const char* str) { Filter* filter = NULL; for (Int32 i = 0; i < (int)strlen(str) - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < strlen(str); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { return i; } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } return -1; } void CheckAndModify(char* str, const char replace = '*') { Filter* filter = NULL; for (Int32 i = 0; i < (int)strlen(str) - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < strlen(str); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { for (UInt32 k = i; k <= j; k++) { str[k] = replace; } if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT) { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } else { continue; } } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } } void CheckAndModify(std::string& str, const char replace = '*') { Filter* filter = NULL; for (Int32 i = 0; i < (int)str.size() - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < str.size(); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { for (UInt32 k = i; k <= j; k++) { str[k] = replace; } if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT) { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } else { continue; } } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } }private: struct FilterNode { char m_Flag; void* m_NextFilter; enum Flag { NODE_IS_NULL = 0x00, NODE_HAS_NEXT = 0x01, NODE_IS_END = 0x10, }; FilterNode() : m_Flag(NODE_IS_NULL), m_NextFilter(NULL) {} }; struct Filter { FilterNode m_NodeArray[256]; } m_Filter; void Clean(Filter* filter) { for (UInt32 i = 0; i < 256; i++) { if (filter->m_NodeArray[i].m_NextFilter) { Clean((Filter *)filter->m_NodeArray[i].m_NextFilter); XDELETE((Filter*)filter->m_NodeArray[i].m_NextFilter); } } }};} // namespace KGame
2.基于KMP算法

聊天过滤词算法的解决思路
提高过滤的算法个人认为主要从两个方面考虑:(1)尽量减少内存、IO的次数。(2)增加串内查找的速度。
基于这两点我想采用连续的内存片,可以减少内存地址跳跃的次数,采用静态的内存这就解决了(1)的问题,第二点是增加串内查找的速度,这个比较公认的事KMP算法

class WordFilter{public:WordFilter();~WordFilter();void Init();void FilterWord(string& word);int  Index_KMP(const char* S, const char* T, int pos);private:std::set
m_storage;const char** m_words;uint32 m_count;};WordFilter::WordFilter(){m_words = NULL;m_count = 0;}WordFilter::~WordFilter(){if(m_words) {free(m_words);}}void WordFilter::Init(){// 把所有屏蔽词都放到m_storage里m_count = m_storage.size();if(m_count) {m_words = (const char**)malloc(sizeof(char*)*m_count);std::set
::iterator ptr;int i = 0;for(ptr = m_storage.begin(); ptr != m_storage.end(); ++ptr,i++) {m_words[i] = ptr->c_str();}}}static inline void _filterWord(char* word, const char* lowerWord, const char* oldstr){int len = strlen(oldstr);const char* tmp;memset(word, '*', len);word += len;lowerWord += len;while((tmp = Index_KMP(lowerWord, oldstr)) != NULL) {word += (tmp-lowerWord);memset(word, '*', len);word += len;lowerWord = tmp + len;}}void WordFilter::FilterWord(string& word){string tmp(word);str_tolower(tmp);const char** p = (const char**)m_words;const char* dest;for(uint32 i=0; i
T[0]) return i-T[0]; else return 0; }
以上两种方法相对比较好一点。以做参考。

转载地址:https://blog.csdn.net/zhangwu416826/article/details/7838479 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:new与malloc的区别
下一篇:mangos源码分析--计划

发表评论

最新留言

哈哈,博客排版真的漂亮呢~
[***.90.31.176]2024年04月16日 20时20分09秒

关于作者

    喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!

推荐文章