How to Write a Spelling Corrector in C#
April 05, 2013
Github repository - feel free to send improvements
Peter Norvig’s spelling corrector is fairly famous in nerd-circles as it describes the first steps in creating a Google-style spelling corrector that will take something like “Speling”, recognise its closest word and reply “Did you mean Spelling?”.
His original is a few years old now, and only 21 lines of compact Python. Below is my attempt to convert it to C#. There are already some links to C# conversions on Peter Norvig’s page, however I wanted one that was closer to C# and didn’t rely on a 3rd party library for collection helpers, as Frederic Torres’s does. The other C# version was a 404 last time I looked.
Hopefully there’s no obvious errors, but feel free to reply if there are - I am a Python newbie and got a lot of help from the Java conversion and trawling through a few Python tutorials on its powerful-but-hard-to-read (but admittedly really concise) set syntax, and also with the help of Simon my colleague pointing out some glaring errors. I haven’t gone for brevity, as it’s 140+ lines of code, nor efficiency. Peter Norvig describes some speed ups you can perform on the original page, and one obvious one is to use a standard dictionary file and store this in a Bloom filter, with the trained words stored in the same dictionary format, looking through the bloom filter as a second measure.
The dictionary big.txt can be found here.
| using System; | |
| using System.Collections.Generic; | |
| using System.IO; | |
| using System.Linq; | |
| using System.Text.RegularExpressions; | |
| namespace SpellingCorrector | |
| { | |
| /// <summary> | |
| /// Conversion from http://norvig.com/spell-correct.html by C.Small | |
| /// </summary> | |
| public class Spelling | |
| { | |
| private Dictionary<String, int> _dictionary = new Dictionary<String, int>(); | |
| private static Regex _wordRegex = new Regex("[a-z]+", RegexOptions.Compiled); | |
| public Spelling() | |
| { | |
| string fileContent = File.ReadAllText("big.txt"); | |
| List<string> wordList = fileContent.Split(new string[] {"\n"} , StringSplitOptions.RemoveEmptyEntries).ToList(); | |
| foreach (var word in wordList) | |
| { | |
| string trimmedWord = word.Trim().ToLower(); | |
| if (_wordRegex.IsMatch(trimmedWord)) | |
| { | |
| if (_dictionary.ContainsKey(trimmedWord)) | |
| _dictionary[trimmedWord]++; | |
| else | |
| _dictionary.Add(trimmedWord, 1); | |
| } | |
| } | |
| } | |
| public string Correct(string word) | |
| { | |
| if (string.IsNullOrEmpty(word)) | |
| return word; | |
| word = word.ToLower(); | |
| // known() | |
| if (_dictionary.ContainsKey(word)) | |
| return word; | |
| List<String> list = Edits(word); | |
| Dictionary<string, int> candidates = new Dictionary<string, int>(); | |
| foreach (string wordVariation in list) | |
| { | |
| if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation)) | |
| candidates.Add(wordVariation, _dictionary[wordVariation]); | |
| } | |
| if (candidates.Count > 0) | |
| return candidates.OrderByDescending(x => x.Value).First().Key; | |
| // known_edits2() | |
| foreach (string item in list) | |
| { | |
| foreach (string wordVariation in Edits(item)) | |
| { | |
| if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation)) | |
| candidates.Add(wordVariation, _dictionary[wordVariation]); | |
| } | |
| } | |
| return (candidates.Count > 0) ? candidates.OrderByDescending(x => x.Value).First().Key : word; | |
| } | |
| private List<string> Edits(string word) | |
| { | |
| var splits = new List<Tuple<string, string>>(); | |
| var transposes = new List<string>(); | |
| var deletes = new List<string>(); | |
| var replaces = new List<string>(); | |
| var inserts = new List<string>(); | |
| // Splits | |
| for (int i = 0; i < word.Length; i++) | |
| { | |
| var tuple = new Tuple<string, string>(word.Substring(0, i), word.Substring(i)); | |
| splits.Add(tuple); | |
| } | |
| // Deletes | |
| for (int i = 0; i < splits.Count; i++) | |
| { | |
| string a = splits[i].Item1; | |
| string b = splits[i].Item2; | |
| if (!string.IsNullOrEmpty(b)) | |
| { | |
| deletes.Add(a + b.Substring(1)); | |
| } | |
| } | |
| // Transposes | |
| for (int i = 0; i < splits.Count; i++) | |
| { | |
| string a = splits[i].Item1; | |
| string b = splits[i].Item2; | |
| if (b.Length > 1) | |
| { | |
| transposes.Add(a + b[1] + b[0] + b.Substring(2)); | |
| } | |
| } | |
| // Replaces | |
| for (int i = 0; i < splits.Count; i++) | |
| { | |
| string a = splits[i].Item1; | |
| string b = splits[i].Item2; | |
| if (!string.IsNullOrEmpty(b)) | |
| { | |
| for (char c = 'a'; c <= 'z'; c++) | |
| { | |
| replaces.Add(a + c + b.Substring(1)); | |
| } | |
| } | |
| } | |
| // Inserts | |
| for (int i = 0; i < splits.Count; i++) | |
| { | |
| string a = splits[i].Item1; | |
| string b = splits[i].Item2; | |
| for (char c = 'a'; c <= 'z'; c++) | |
| { | |
| inserts.Add(a + c + b); | |
| } | |
| } | |
| return deletes.Union(transposes).Union(replaces).Union(inserts).ToList(); | |
| } | |
| } | |
| } |
| class Program | |
| { | |
| static void Main(string[] args) | |
| { | |
| Spelling spelling = new Spelling(); | |
| string word = ""; | |
| word = "speling"; | |
| Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
| word = "korrecter"; // 'correcter' is not in the dictionary file so this doesn't work | |
| Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
| word = "korrect"; | |
| Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
| word = "acess"; | |
| Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
| word = "supposidly"; | |
| Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
| // A sentence | |
| string sentence = "I havve speled thes woord wwrong"; // sees speed instead of spelled (see notes on norvig.com) | |
| string correction = ""; | |
| foreach (string item in sentence.Split(' ')) | |
| { | |
| correction += " " +spelling.Correct(item); | |
| } | |
| Console.WriteLine("Did you mean:" + correction); | |
| Console.Read(); | |
| } | |
| } |
I'm Chris Small, a software engineer working in London. This is my tech blog. Find out more about me via Github, Stackoverflow, Resume