C#敏感词过滤算法实现

您所在的位置:网站首页 过滤关键词 C#敏感词过滤算法实现

C#敏感词过滤算法实现

2024-07-15 16:43| 来源: 网络整理| 查看: 265

原文:https://blog.csdn.net/u011966339/article/details/72832197

 

1. DFA算法简介

DFA全称为:Deterministic Finite Automaton,即确定有穷自动机。其特征为:有一个有限状态集合和一些从一个状态通向另一个状态的边,每条边上标记有一个符号,其中一个状态是初态,某些状态是终态。但不同于不确定的有限自动机,DFA中不会有从同一状态出发的两条边标记有相同的符号。

 

 

 

简单点说就是,它是通过event和当前的state得到下一个state,即event+state=nextstate。可以理解为系统中有多个节点,通过传入的event来确定沿哪条路由走向另一个节点,而节点的数量是有限的。

2. 实现代码如下:新建一个 FilterHelper.cs 类,用于统一存放敏感词过滤的处理方法。

————————————————

using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.Text;

namespace ConsoleApp1
{
    #region Illegal keyword filter, beta 1.1

    /// <summary>
    /// Sensitive-word filter. Dictionary words are matched against the source text
    /// while ignoring any character between two matched characters that is not a
    /// CJK ideograph, an ASCII digit or an ASCII letter.
    /// </summary>
    /// <remarks>
    /// Both the dictionary and the source text are normalised before comparison
    /// (full-width forms folded to half-width, then lower-cased), so matching is
    /// case-insensitive and width-insensitive for every character of a word.
    /// </remarks>
    public class FilterHelper
    {
        private string dictionaryPath = string.Empty;

        private string sourctText = string.Empty;

        /// <summary>
        /// In-memory lexicon indexed by the first character of each word.
        /// Sized <c>char.MaxValue + 1</c> so that every possible char value,
        /// including U+FFFF, is a valid index.
        /// </summary>
        private readonly WordGroup[] MEMORYLEXICON = new WordGroup[char.MaxValue + 1];

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Creates a filter with no dictionary; set <see cref="DictionaryPath"/> before filtering.
        /// </summary>
        public FilterHelper()
        {
        }

        /// <summary>
        /// Creates a filter that loads its words from the given file.
        /// </summary>
        /// <param name="dictionaryPath">Path of a text file containing one word per line.</param>
        public FilterHelper(string dictionaryPath)
        {
            this.dictionaryPath = dictionaryPath;
        }

        /// <summary>
        /// Path of the dictionary file (one word per line).
        /// </summary>
        public string DictionaryPath
        {
            get { return dictionaryPath; }
            set { dictionaryPath = value; }
        }

        /// <summary>
        /// Text to be checked by <see cref="Filter"/>.
        /// </summary>
        public string SourctText
        {
            get { return sourctText; }
            set { sourctText = value; }
        }

        /// <summary>
        /// Spans of the source text that the most recent <see cref="Filter"/> call
        /// replaced, in order of appearance and exactly as they appeared in the source.
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Returns true when the character lies in the CJK ideograph range U+4E00..U+9FA5.
        /// </summary>
        private bool isCHS(char character)
        {
            int charVal = (int)character;
            return charVal >= 0x4e00 && charVal <= 0x9fa5;
        }

        /// <summary>
        /// Returns true for the ASCII digits '0'..'9'.
        /// </summary>
        private bool isNum(char character)
        {
            int charVal = (int)character;
            return charVal >= 48 && charVal <= 57;
        }

        /// <summary>
        /// Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.
        /// </summary>
        private bool isAlphabet(char character)
        {
            int charVal = (int)character;
            return (charVal >= 97 && charVal <= 122) || (charVal >= 65 && charVal <= 90);
        }

        /// <summary>
        /// Returns true for characters that are skipped while matching, i.e. anything
        /// that is neither a CJK ideograph, a digit nor a letter.
        /// </summary>
        private bool IsIgnorable(char character)
        {
            return !isCHS(character) && !isNum(character) && !isAlphabet(character);
        }

        /// <summary>
        /// Normalises text for comparison: the ideographic space (U+3000) becomes an
        /// ASCII space, full-width forms U+FF01..U+FF5E become their half-width
        /// equivalents, and the result is lower-cased.
        /// </summary>
        /// <param name="input">Text to normalise; must not be null.</param>
        /// <returns>A string with exactly the same length as <paramref name="input"/>.</returns>
        private string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                }
                else if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }

                // Per-char invariant lower-casing keeps the mapping one-to-one, so indices
                // into the normalised text are also valid indices into the original text.
                c[i] = char.ToLowerInvariant(c[i]);
            }

            return new string(c);
        }

        /// <summary>
        /// Rebuilds the in-memory lexicon from <see cref="DictionaryPath"/>.
        /// Does nothing when no path is configured. Blank lines and duplicate
        /// words are ignored; surrounding whitespace is trimmed.
        /// </summary>
        private void LoadDictionary()
        {
            if (string.IsNullOrEmpty(DictionaryPath))
            {
                return;
            }

            Array.Clear(MEMORYLEXICON, 0, MEMORYLEXICON.Length);

            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            string[] lines = System.IO.File.ReadAllLines(DictionaryPath, System.Text.Encoding.Default);
            foreach (string line in lines)
            {
                // Conversion to Simplified Chinese (Strings.StrConv from Microsoft.VisualBasic)
                // is intentionally not applied: it is not supported on every system.
                string word = ToDBC(line).Trim();
                if (word.Length == 0 || !seen.Add(word))
                {
                    continue;
                }

                WordGroup group = MEMORYLEXICON[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    MEMORYLEXICON[word[0]] = group;
                }

                // Only the tail is stored; the first character is implied by the slot.
                group.Add(word.Substring(1));
            }
        }

        /// <summary>
        /// Tries to match the remaining characters of a dictionary word after its
        /// first character has already been matched at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Normalised source text.</param>
        /// <param name="start">Index of the already-matched first character.</param>
        /// <param name="tail">Dictionary word without its first character (may be empty).</param>
        /// <returns>Index of the last matched character, or -1 when the word does not match.</returns>
        private int MatchTail(string text, int start, string tail)
        {
            int last = start;
            foreach (char expected in tail)
            {
                int next = last + 1;
                while (next < text.Length && IsIgnorable(text[next]))
                {
                    next++;
                }

                // Running off the end of the text means the word is incomplete: no match.
                if (next >= text.Length || text[next] != expected)
                {
                    return -1;
                }

                last = next;
            }

            return last;
        }

        /// <summary>
        /// Finds the longest dictionary word that starts at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Normalised source text.</param>
        /// <param name="start">Index at which a word must begin.</param>
        /// <returns>Index of the last character of the longest match, or -1 when nothing matches.</returns>
        private int FindLongestMatch(string text, int start)
        {
            WordGroup group = MEMORYLEXICON[text[start]];
            if (group == null)
            {
                return -1;
            }

            int best = -1;
            for (int z = 0; z < group.Count(); z++)
            {
                int end = MatchTail(text, start, group.GetWord(z));
                if (end > best)
                {
                    best = end;
                }
            }

            return best;
        }

        /// <summary>
        /// Reloads the dictionary, scans <see cref="SourctText"/> and replaces every
        /// character of each detected word (including ignored characters inside it)
        /// with <paramref name="replaceChar"/>. When several dictionary words start at
        /// the same position the longest one is used.
        /// </summary>
        /// <param name="replaceChar">Character written over each detected span.</param>
        /// <returns>
        /// The masked text, or an empty string when <see cref="SourctText"/> is null or empty.
        /// </returns>
        /// <remarks>
        /// <see cref="IllegalWords"/> is cleared at the start of every call, so it always
        /// describes the most recent run only.
        /// </remarks>
        public string Filter(char replaceChar)
        {
            LoadDictionary();
            illegalWords.Clear();

            if (string.IsNullOrEmpty(sourctText))
            {
                return string.Empty;
            }

            // Normalise once; ToDBC preserves length so positions map 1:1 onto the source.
            string normalized = ToDBC(sourctText);
            char[] masked = sourctText.ToCharArray();

            int position = 0;
            while (position < normalized.Length)
            {
                int matchEnd = FindLongestMatch(normalized, position);
                if (matchEnd < 0)
                {
                    position++;
                    continue;
                }

                illegalWords.Add(sourctText.Substring(position, matchEnd - position + 1));
                for (int k = position; k <= matchEnd; k++)
                {
                    masked[k] = replaceChar;
                }

                // Resume right after the masked span so matches never overlap.
                position = matchEnd + 1;
            }

            return new string(masked);
        }
    }

    /// <summary>
    /// The tails of all dictionary words that share the same first character.
    /// </summary>
    class WordGroup
    {
        private readonly List<string> groupList;

        public WordGroup()
        {
            groupList = new List<string>();
        }

        /// <summary>
        /// Adds a word tail to the group.
        /// </summary>
        /// <param name="word">Word without its first character.</param>
        public void Add(string word)
        {
            groupList.Add(word);
        }

        /// <summary>
        /// Number of word tails in the group.
        /// </summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the word tail stored at the given zero-based index.
        /// </summary>
        /// <param name="index">Zero-based position inside the group.</param>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }

    #endregion
}

 

使用:

/// <summary>
/// Console demo (.NET Core): loads the word list that sits next to the project,
/// filters a sample sentence, then prints every detected word followed by the
/// masked sentence.
/// </summary>
static void Main(string[] args)
{
    // Strip the build-output suffix from the working directory.
    string projectDirectory = Directory.GetCurrentDirectory().Replace("\\bin\\Debug\\netcoreapp3.1", "");

    // Text file holding the sensitive words.
    string dictionaryFile = projectDirectory + "/暴恐词库.txt";

    FilterHelper helper = new FilterHelper(dictionaryFile);
    helper.SourctText = "你个大推背";

    // Every detected word is replaced with '*'.
    string maskedText = helper.Filter('*');

    // The words that were detected during the Filter call.
    var detected = helper.IllegalWords;
    for (int index = 0; index < detected.Count; index++)
    {
        Console.WriteLine(detected[index]);
    }

    Console.WriteLine(maskedText);
}

 

另附Demo的敏感词下载:https://github.com/chason777777/mgck/archive/master.zip



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3