How to Write a Spelling Corrector in C#
April 05, 2013
Github repository - feel free to send improvements
Peter Norvig’s spelling corrector is fairly famous in nerd-circles as it describes the first steps in creating a Google-style spelling corrector that will take something like “Speling”, recognise its closest word and reply “Did you mean Spelling?”.
His original is a few years old now, and only 21 lines of compact Python. Below is my attempt to convert it to C#. There are already some links to C# conversions on Peter Norvig’s page, however I wanted one that was closer to C# and didn’t rely on a 3rd party library for collection helpers, as Frederic Torres’s does. The other C# version was a 404 last time I looked.
Hopefully there’s no obvious errors, but feel free to reply if there are - I am a Python newbie and got a lot of help from the Java conversion and trawling through a few Python tutorials on its powerful-but-hard-to-read (but admittedly really concise) set syntax, and also with the help of Simon my colleague pointing out some glaring errors. I haven’t gone for brevity, as it’s 140+ lines of code, nor efficiency. Peter Norvig describes some speed ups you can perform on the original page, and one obvious one is to use a standard dictionary file and store this in a Bloom filter, with the trained words stored in the same dictionary format, looking through the bloom filter as a second measure.
The dictionary big.txt can be found here.
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
namespace SpellingCorrector | |
{ | |
/// <summary> | |
/// Conversion from http://norvig.com/spell-correct.html by C.Small | |
/// </summary> | |
public class Spelling | |
{ | |
private Dictionary<String, int> _dictionary = new Dictionary<String, int>(); | |
private static Regex _wordRegex = new Regex("[a-z]+", RegexOptions.Compiled); | |
public Spelling() | |
{ | |
string fileContent = File.ReadAllText("big.txt"); | |
List<string> wordList = fileContent.Split(new string[] {"\n"} , StringSplitOptions.RemoveEmptyEntries).ToList(); | |
foreach (var word in wordList) | |
{ | |
string trimmedWord = word.Trim().ToLower(); | |
if (_wordRegex.IsMatch(trimmedWord)) | |
{ | |
if (_dictionary.ContainsKey(trimmedWord)) | |
_dictionary[trimmedWord]++; | |
else | |
_dictionary.Add(trimmedWord, 1); | |
} | |
} | |
} | |
public string Correct(string word) | |
{ | |
if (string.IsNullOrEmpty(word)) | |
return word; | |
word = word.ToLower(); | |
// known() | |
if (_dictionary.ContainsKey(word)) | |
return word; | |
List<String> list = Edits(word); | |
Dictionary<string, int> candidates = new Dictionary<string, int>(); | |
foreach (string wordVariation in list) | |
{ | |
if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation)) | |
candidates.Add(wordVariation, _dictionary[wordVariation]); | |
} | |
if (candidates.Count > 0) | |
return candidates.OrderByDescending(x => x.Value).First().Key; | |
// known_edits2() | |
foreach (string item in list) | |
{ | |
foreach (string wordVariation in Edits(item)) | |
{ | |
if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation)) | |
candidates.Add(wordVariation, _dictionary[wordVariation]); | |
} | |
} | |
return (candidates.Count > 0) ? candidates.OrderByDescending(x => x.Value).First().Key : word; | |
} | |
private List<string> Edits(string word) | |
{ | |
var splits = new List<Tuple<string, string>>(); | |
var transposes = new List<string>(); | |
var deletes = new List<string>(); | |
var replaces = new List<string>(); | |
var inserts = new List<string>(); | |
// Splits | |
for (int i = 0; i < word.Length; i++) | |
{ | |
var tuple = new Tuple<string, string>(word.Substring(0, i), word.Substring(i)); | |
splits.Add(tuple); | |
} | |
// Deletes | |
for (int i = 0; i < splits.Count; i++) | |
{ | |
string a = splits[i].Item1; | |
string b = splits[i].Item2; | |
if (!string.IsNullOrEmpty(b)) | |
{ | |
deletes.Add(a + b.Substring(1)); | |
} | |
} | |
// Transposes | |
for (int i = 0; i < splits.Count; i++) | |
{ | |
string a = splits[i].Item1; | |
string b = splits[i].Item2; | |
if (b.Length > 1) | |
{ | |
transposes.Add(a + b[1] + b[0] + b.Substring(2)); | |
} | |
} | |
// Replaces | |
for (int i = 0; i < splits.Count; i++) | |
{ | |
string a = splits[i].Item1; | |
string b = splits[i].Item2; | |
if (!string.IsNullOrEmpty(b)) | |
{ | |
for (char c = 'a'; c <= 'z'; c++) | |
{ | |
replaces.Add(a + c + b.Substring(1)); | |
} | |
} | |
} | |
// Inserts | |
for (int i = 0; i < splits.Count; i++) | |
{ | |
string a = splits[i].Item1; | |
string b = splits[i].Item2; | |
for (char c = 'a'; c <= 'z'; c++) | |
{ | |
inserts.Add(a + c + b); | |
} | |
} | |
return deletes.Union(transposes).Union(replaces).Union(inserts).ToList(); | |
} | |
} | |
} |
class Program | |
{ | |
static void Main(string[] args) | |
{ | |
Spelling spelling = new Spelling(); | |
string word = ""; | |
word = "speling"; | |
Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
word = "korrecter"; // 'correcter' is not in the dictionary file so this doesn't work | |
Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
word = "korrect"; | |
Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
word = "acess"; | |
Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
word = "supposidly"; | |
Console.WriteLine("{0} => {1}", word, spelling.Correct(word)); | |
// A sentence | |
string sentence = "I havve speled thes woord wwrong"; // sees speed instead of spelled (see notes on norvig.com) | |
string correction = ""; | |
foreach (string item in sentence.Split(' ')) | |
{ | |
correction += " " +spelling.Correct(item); | |
} | |
Console.WriteLine("Did you mean:" + correction); | |
Console.Read(); | |
} | |
} |
I'm Chris Small, a software engineer working in London. This is my tech blog. Find out more about me via Github, Stackoverflow, Resume