Chris's coding blog

How to Write a Spelling Corrector in C#

April 05, 2013

Github repository - feel free to send improvements

Peter Norvig’s spelling corrector is fairly famous in nerd-circles as it describes the first steps in creating a Google-style spelling corrector that will take something like “Speling”, recognise its closest word and reply “Did you mean Spelling?”.

His original is a few years old now, and only 21 lines of compact Python. Below is my attempt to convert it to C#. There are already some links to C# conversions on Peter Norvig’s page, however I wanted one that was closer to C# and didn’t rely on a 3rd party library for collection helpers, as Frederic Torres’s does. The other C# version was a 404 last time I looked.

Hopefully there’s no obvious errors, but feel free to reply if there are - I am a Python newbie and got a lot of help from the Java conversion and trawling through a few Python tutorials on its powerful-but-hard-to-read (but admittedly really concise) set syntax, and also with the help of Simon my colleague pointing out some glaring errors. I haven’t gone for brevity, as it’s 140+ lines of code, nor efficiency. Peter Norvig describes some speed ups you can perform on the original page, and one obvious one is to use a standard dictionary file and store this in a Bloom filter, with the trained words stored in the same dictionary format, looking through the bloom filter as a second measure.

The dictionary big.txt can be found here.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
namespace SpellingCorrector
{
/// <summary>
/// Conversion from http://norvig.com/spell-correct.html by C.Small
/// </summary>
public class Spelling
{
private Dictionary<String, int> _dictionary = new Dictionary<String, int>();
private static Regex _wordRegex = new Regex("[a-z]+", RegexOptions.Compiled);
public Spelling()
{
string fileContent = File.ReadAllText("big.txt");
List<string> wordList = fileContent.Split(new string[] {"\n"} , StringSplitOptions.RemoveEmptyEntries).ToList();
foreach (var word in wordList)
{
string trimmedWord = word.Trim().ToLower();
if (_wordRegex.IsMatch(trimmedWord))
{
if (_dictionary.ContainsKey(trimmedWord))
_dictionary[trimmedWord]++;
else
_dictionary.Add(trimmedWord, 1);
}
}
}
public string Correct(string word)
{
if (string.IsNullOrEmpty(word))
return word;
word = word.ToLower();
// known()
if (_dictionary.ContainsKey(word))
return word;
List<String> list = Edits(word);
Dictionary<string, int> candidates = new Dictionary<string, int>();
foreach (string wordVariation in list)
{
if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation))
candidates.Add(wordVariation, _dictionary[wordVariation]);
}
if (candidates.Count > 0)
return candidates.OrderByDescending(x => x.Value).First().Key;
// known_edits2()
foreach (string item in list)
{
foreach (string wordVariation in Edits(item))
{
if (_dictionary.ContainsKey(wordVariation) && !candidates.ContainsKey(wordVariation))
candidates.Add(wordVariation, _dictionary[wordVariation]);
}
}
return (candidates.Count > 0) ? candidates.OrderByDescending(x => x.Value).First().Key : word;
}
private List<string> Edits(string word)
{
var splits = new List<Tuple<string, string>>();
var transposes = new List<string>();
var deletes = new List<string>();
var replaces = new List<string>();
var inserts = new List<string>();
// Splits
for (int i = 0; i < word.Length; i++)
{
var tuple = new Tuple<string, string>(word.Substring(0, i), word.Substring(i));
splits.Add(tuple);
}
// Deletes
for (int i = 0; i < splits.Count; i++)
{
string a = splits[i].Item1;
string b = splits[i].Item2;
if (!string.IsNullOrEmpty(b))
{
deletes.Add(a + b.Substring(1));
}
}
// Transposes
for (int i = 0; i < splits.Count; i++)
{
string a = splits[i].Item1;
string b = splits[i].Item2;
if (b.Length > 1)
{
transposes.Add(a + b[1] + b[0] + b.Substring(2));
}
}
// Replaces
for (int i = 0; i < splits.Count; i++)
{
string a = splits[i].Item1;
string b = splits[i].Item2;
if (!string.IsNullOrEmpty(b))
{
for (char c = 'a'; c <= 'z'; c++)
{
replaces.Add(a + c + b.Substring(1));
}
}
}
// Inserts
for (int i = 0; i < splits.Count; i++)
{
string a = splits[i].Item1;
string b = splits[i].Item2;
for (char c = 'a'; c <= 'z'; c++)
{
inserts.Add(a + c + b);
}
}
return deletes.Union(transposes).Union(replaces).Union(inserts).ToList();
}
}
}

class Program
{
static void Main(string[] args)
{
Spelling spelling = new Spelling();
string word = "";
word = "speling";
Console.WriteLine("{0} => {1}", word, spelling.Correct(word));
word = "korrecter"; // 'correcter' is not in the dictionary file so this doesn't work
Console.WriteLine("{0} => {1}", word, spelling.Correct(word));
word = "korrect";
Console.WriteLine("{0} => {1}", word, spelling.Correct(word));
word = "acess";
Console.WriteLine("{0} => {1}", word, spelling.Correct(word));
word = "supposidly";
Console.WriteLine("{0} => {1}", word, spelling.Correct(word));
// A sentence
string sentence = "I havve speled thes woord wwrong"; // sees speed instead of spelled (see notes on norvig.com)
string correction = "";
foreach (string item in sentence.Split(' '))
{
correction += " " +spelling.Correct(item);
}
Console.WriteLine("Did you mean:" + correction);
Console.Read();
}
}
view raw gistfile1.cs hosted with ❤ by GitHub

csharp

Chris Small

I'm Chris Small, a software engineer living and working in London. This is my tech blog. Find out more via GithubStackoverflowResume