Saturday, January 4, 2014

Spell Check

I am using self balanced binary search tree to create the dictionary because the hash values for the dictionary wordsEn.txt vary from few thousands to few thousands of millions. If I want to map this big range to small one, there may be more collision.

My hash function gives me same hash value for the strings even if the character are jumbled. For example, "add" and "dad" will have the same hash value. I collect the list of words which have the same has value as the given string and find how close these words to the given string. I sort these suggestions in the order where the closest possibility comes first.

To get more words, i repeat this with adding a character each time from 'a' to 'z' with the given string and one more time with removing the character from all the possible indices.
Finally print the list.

Filename: dictionary.hpp
#ifndef __MY_DICTIONARY__
#define __MY_DICTIONARY__

#include <iostream>
#include <string>
using namespace std;

typedef struct word t_word;

class wordlist {
private:
  string dictionaryfile;
  t_word *dictionary;

public:
  wordlist (string dictfile)
  {
    this->dictionaryfile = dictfile;
    this->dictionary = NULL;
  }

  wordlist()
  {
  }

  void setWordFile (string wordfile)
  {
    this->dictionaryfile = wordfile;
  }

  string getWordFile ()
  {
    return this->dictionaryfile;
  }

  ~wordlist ()
  {
  }

  bool generateDictionary ();
  unsigned int calcHash (string);
  t_word *create (string);
  void rotateLeft (t_word*);
  void rotateRight (t_word*);
  void balanceFactors (t_word*, t_word*);
  void balanceLeftRight (t_word*, t_word*);
  void balanceRightLeft (t_word*, t_word*);
  void balance (t_word*, t_word*);
  void insert (string);
  int getScore (string, string);
  int findMatch (string, vector<string>&);
};

#endif
Filename: dictionary.cpp
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <stdlib.h>

using namespace std;

#include "dictionary.hpp"

// Node to store the word and the corresponding hash values
struct word{
  string word;
  unsigned int hash;

  // Pointers to create the self-balanced binary search tree
  t_word *parent;
  t_word *left;
  t_word *right;
  char balanceFactor;
  
  // Pointer to store the words with the same hash value
  t_word *next;
};

// Generates the dictionary (self-balanced binary search tree)
// with the list of words from the given dictionary file.
bool wordlist::generateDictionary ()
{
  ifstream ifs (this->dictionaryfile.c_str());
  string str;
  
  try 
  {
    while (getline (ifs, str))
    {
      // Remove the new line 
      str.resize (str.length()-1);

      // Add the word into the dictionary
      insert (str);
    }
    ifs.close();
  } 
  catch (exception& e)
  {
    cerr << "[Error]: " << e.what() << endl;
    if (ifs.is_open())
    {
      ifs.close();
    }
    return false;
  }
  
  return true;
}

// Calculates the hash value for the given word
// Position of the characters does not have importance here.
unsigned int wordlist::calcHash (string word)
{
  unsigned int hash = 0;
  int val[26] = {0};
  const char *exp = word.c_str();
  while (('a' <= *exp && 
          'z' >= *exp) ||
         '\'' == *exp)
  {
    hash = hash + ((
                    (int(*exp) & 0x01) * 0xcf * 3+ 
                    (int(*exp) & 0x02) * 0xbf * 37+
                    (int(*exp) & 0x04) * 0xaf * 79+
                    (int(*exp) & 0x08) * 0x9f * 131+
                    (int(*exp) & 0x10) * 0x8f * 181+
                    (int(*exp) & 0x20) * 0x7f * 239+
                    (int(*exp) & 0x40) * 0x6f * 293+
                    (int(*exp) & 0x80) * 0x5f * 359) 
                   * (int (*exp) - int ('a') + 1) * 421 ) +
      (int (*exp) - int ('a') + 1) * 17;
      exp++;
  }
  hash += word.length();
  return hash;
}

// Create a new node with the given string and its has value
t_word* wordlist::create (string str)
{
  unsigned int hash = calcHash (str);
  t_word *t = new t_word;
  if (NULL == t)
  {
    cerr << "[Error] memory allocation failed." << endl;
    exit (1);
  }

  // cout << hash << " [" << str << "]" << endl;
  t->word = str;
  t->hash = hash;
  t->parent = NULL;
  t->left = NULL;
  t->right = NULL;
  t->next = NULL;
  t->balanceFactor = '=';
  
  return t;
}

void wordlist::rotateLeft (t_word *n)
{
  t_word *t = n->right;
  
  n->right = t->left;
  if (NULL != t->left)
  {
    t->left->parent = n;
  }
  
  if (NULL != n->parent)
  {
    if (n->parent->left == n)
    {
      n->parent->left = t;
    }
    else
    {
      n->parent->right = t;
    }
  }
  else
  {
    this->dictionary = t;
    t->parent = NULL;
  }
  t->left = n;
  t->parent = n->parent;
  n->parent = t;
}

void wordlist::rotateRight (t_word *n)
{
  t_word *t = n->left;
  
  if (NULL == n->left && NULL == n->right)
    return;
  
  n->left = t->right;
  if (NULL != t->right)
  {
    t->right->parent = n;
  }
  
  if (NULL != n->parent)
  {
    if (n->parent->left == n)
    {
      n->parent->left = t;
    }
    else
    {
      n->parent->right = t;
    }
  }
  else
  {
    this->dictionary = t;
    t->parent = NULL;
  }
  t->right = n;
  t->parent = n->parent;
  n->parent = t;
}

void wordlist::balanceFactors (t_word *ancestor, t_word *newLeaf)
{
  t_word *t = newLeaf->parent;
  while (t != NULL && t != ancestor)
  {
    if (newLeaf->hash <= t->hash)
      t->balanceFactor = 'L';
    else
      t->balanceFactor = 'R';
    
    t = t->parent;
  }
}

void wordlist::balanceLeftRight (t_word *ancestor, t_word *newLeaf)
{
  if (this->dictionary == ancestor)
  {
    ancestor->balanceFactor = '=';
  }
  else
  {
    t_word *t = ancestor;
    if (newLeaf->hash <= ancestor->parent->hash)
    {
      ancestor->balanceFactor = 'R';
      t = ancestor->parent->left;
    }
    else
    {
      ancestor->balanceFactor = '=';
      ancestor->parent->left->balanceFactor = 'L';
    }
    balanceFactors (t, newLeaf);
  }
}

void wordlist::balanceRightLeft (t_word *ancestor, t_word *newLeaf)
{
  if (this->dictionary == ancestor)
  {
    ancestor->balanceFactor = '=';
  }
  else
  {
    t_word *t = ancestor;
    if (newLeaf->hash > ancestor->parent->hash)
    {
      ancestor->balanceFactor = 'L';
      t = ancestor->parent->right;
    }
    else
    {
      ancestor->balanceFactor = '=';
      ancestor->parent->left->balanceFactor = 'R';
    }
    balanceFactors (t, newLeaf);
  }
}

void wordlist::balance (t_word *ancestor, t_word *newLeaf)
{
  // Balanced tree
  if (NULL == ancestor)
  {
    if (newLeaf->hash <= this->dictionary->hash)
    {
      this->dictionary->balanceFactor = 'L';
    }
    else
    {
      this->dictionary->balanceFactor = 'R';
    }
    balanceFactors (this->dictionary, newLeaf);
  }
  // Added to the other side of the balanced factor
  else if ((ancestor->balanceFactor == 'L' &&
            newLeaf->hash > ancestor->hash) ||
           (ancestor->balanceFactor == 'R' &&
            newLeaf->hash <= ancestor->hash))
  {
    ancestor->balanceFactor = '=';
    balanceFactors (ancestor, newLeaf);
  }
  // Added to the same side of the ancestor's child tree
  else if ((ancestor->balanceFactor == 'R' &&
            newLeaf->hash > ancestor->right->hash) ||
           (ancestor->balanceFactor == 'L' &&
            newLeaf->hash <= ancestor->left->hash))
  {
    char bf = ancestor->balanceFactor;
    ancestor->balanceFactor = '=';
    switch (bf)
    {
      case 'R':
        rotateLeft(ancestor);
        break;
      case 'L':
        rotateRight(ancestor);
        break;
    }
    balanceFactors (ancestor->parent, newLeaf);
  }
  // Added to the right of ancestor's left child
  else if (ancestor->balanceFactor == 'L' &&
           newLeaf->hash > ancestor->left->hash)
  {
    rotateLeft(ancestor->left);
    rotateRight(ancestor);
    balanceLeftRight (ancestor, newLeaf);
  }
  // Added to the left of ancestor's right child
  else if (ancestor->balanceFactor == 'R' &&
           newLeaf->hash <= ancestor->right->hash)
  {
    rotateRight(ancestor->right);
    rotateLeft(ancestor);
    balanceRightLeft (ancestor, newLeaf);
  }
  else
  {
    cerr << "[Error]: Not a valid operation." << endl;
    exit (1);
  }
}

void wordlist::insert (string str)
{
  t_word *t = create (str);
  t_word *ancestor = NULL;
  if (NULL == this->dictionary)
  {
    this->dictionary = t;
  }
  else
  {
    t_word *r = this->dictionary;
    while (NULL != r)
    {
      if (r->balanceFactor != '=')
        ancestor = r;
      
      if (t->hash < r->hash)
      {
        if (NULL != r->left)
        {
          r =r->left;
        }
        else
        {
          break;
        }
      }
      else if (t->hash > r->hash)
      {
        if (NULL != r->right)
        {
          r =r->right;
        }
        else
        {
          break;
        }
      }
      else
      {
        while (NULL != r->next)
          r = r->next;
        
        r->next = t;
        return;
      }
    }
    
    t->parent = r;
    
    if (t->hash <= r->hash)
    {
      r->left = t;
    }
    else
    {
      r->right = t;
    }

    // Balance the tree using AVL
    balance (ancestor, t);
  }
}

int wordlist::getScore (string s1, string s2)
{
  int i = 0;
  int j = 0;
  int len = s2.length() * s1.length();
  int *a_scores = new int [len];
  int score = 0;

  /*
  cout << "  ";
  for (i=0; i<s1.length(); i++)
  {
    cout << s1[i] << " ";
  }
  cout << endl;
  */

  for (j=0; j<s2.length(); j++)
  {
    // cout << s2[j] << " ";
    for (i=0; i<s1.length(); i++)
    {
      if (s1[i] == s2[j])
      {
        if (i == 0 || j == 0)
        {
          *(a_scores+(j*s1.length())+i) = 1;
        }
        else
        {
          int x = i-1;
          int y = j-1;
          int t_score = 0;
          for (;x>=0 && y>=0; x--,y--)
          {
            if (*(a_scores+(y*s1.length())+x) != 0)
            {
              t_score = *(a_scores+(y*s1.length())+x);
              break;
            }
          }
          for (x=i-1; x>=0; x--)
          {
            if (*(a_scores+(j*s1.length())+x) != 0)
            {
              if (t_score < *(a_scores+(j*s1.length())+x))
              {
                t_score = *(a_scores+(j*s1.length())+x) - 1;
              }
              break;
            }
          } 
          for (y=j-1; y>=0; y--)
          {
            if (*(a_scores+(y*s1.length())+i) != 0)
            {
              if (t_score < *(a_scores+(y*s1.length())+i))
              {
                t_score = *(a_scores+(y*s1.length())+i) - 1;
              }
              break;
            }
          }
          *(a_scores+(j*s1.length())+i) = t_score + 1;
        }
        if (score < *(a_scores+(j*s1.length())+i))
        {
          score = *(a_scores+(j*s1.length())+i);
        }
      }
      else
      {
        *(a_scores+(j*s1.length())+i) = 0;
      }
      // cout << *(a_scores+(j*s1.length())+i) << " ";
    }
    // cout << endl;
  }

  score = s1.length()-score;

  delete[] a_scores;

  return score;
}

int wordlist::findMatch (string str, vector<string>& result)
{
  unsigned int hash = calcHash(str);
  t_word *t = this->dictionary;
  unsigned int count = 0;
  unsigned int level = 0;

  while (NULL != t)
  {
    level ++;
    if (hash < t->hash)
    {
      t = t->left;
    }
    else if (hash > t->hash)
    {
      t = t->right;
    }
    else
    {
      while (NULL != t)
      {
        result.resize(count+1);
        result.at(count) = t->word;
        t = t->next;
        count++;
      }
      return level;
    }
  }

  return level;
}
Filename: spellCheck.cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <iomanip>
#include <sstream>
#include <string>
#include <algorithm>

using namespace std;

#include "dictionary.hpp"

int main ()
{
  wordlist mydict ("wordsEn.txt");
  // My Dictionary
  mydict.generateDictionary ();

  string str;
  // My wordlist which are to be checked
  ifstream ifs ("wordlist.txt");
  while (getline (ifs, str))
  {
    str.resize (str.length()-1);

    vector <string> matches;
    int count  = mydict.findMatch (str, matches);
        
    if (0 < matches.size())
    { 
      for (long index=0; index < matches.size(); index++)
      {
        int score = mydict.getScore (str, matches.at(index));
        ostringstream ss;
        ss << setw(2) << setfill('0') << score;
        matches.at(index) = ss.str() + matches.at(index);

      }
      sort (matches.begin(), matches.end());
      string tmp = matches.at(0);

      // Exact match. Goto the next word in the list
      if ('0' == tmp[0] && '0' == tmp[1])
      {
        cout << "Spelling is CORRECT for (" <<  str << ")" <<endl;
        continue;
      }
    }

    // Add each character from 'a' to 'z' one by one and check if we 
    // get more suggesstions.
    string alpha = "abcdefghijklmnopqrstuvwxyz";
    vector<string> tmp_matches;
    
    for (int i=0; i<alpha.length(); i++)
    {
      mydict.findMatch (str+alpha[i], tmp_matches);
      for (long index=0; index < tmp_matches.size(); index++)
      {

        int score = mydict.getScore (str, tmp_matches.at(index));
        ostringstream ss;
        ss << setw(2) << setfill('0') << score + 1;
        matches.push_back(ss.str() + tmp_matches.at(index));
      }
      tmp_matches.clear();
    }
    
    // Check the suggesstions for the strings by removing one character 
    // in different index and try all the posible index values.
    for (int i=0; i<str.length(); i++)
    {
      string tmp_str = str;
      tmp_str.erase (i, 1);
      mydict.findMatch (tmp_str, tmp_matches);
      for (long index=0; index < tmp_matches.size(); index++)
      {
        int score = mydict.getScore (tmp_str, tmp_matches.at(index));
        ostringstream ss;
        ss << setw(2) << setfill('0') << score + 1;
        matches.push_back(ss.str() + tmp_matches.at(index));
      }
      tmp_matches.clear();
    }

    if (0 < matches.size())
    {
      // Sort all the suggesstions with thr rank
      sort (matches.begin(), matches.end());

      // Remove the rank element from the strings
      for (long index=0; index < matches.size(); index++)
      {
        matches.at(index).erase(0,2);
      }
    
      // Removes duplicate consecutive elements  
      // matches.erase( unique( matches.begin(), matches.end() ), matches.end() );
    
      // Remove all the duplicates
      vector<string> uniqueMatches;
      uniqueMatches.push_back(matches.at(0));
      for (long index=1; index < matches.size(); index++)
      {
        if (uniqueMatches.end() == find (uniqueMatches.begin(), uniqueMatches.end(), matches.at(index)))
        {
          uniqueMatches.push_back(matches.at(index));
        }
      }
    
      // Final result
      cout << "Matcing for (" << str << "): { ";
      for (long index=0; index < uniqueMatches.size(); index++)
      {
        cout <<uniqueMatches.at(index) << " ";
      }
      cout << "}" << endl;
    }
    else
    {
      cout << count << " Matche not found for (" << str << ")" << endl;
    }
  }

  return 0;
}
You can find the wordlist.txt from here

No comments :

Post a Comment