My hash function gives me same hash value for the strings even if the character are jumbled. For example, "add" and "dad" will have the same hash value. I collect the list of words which have the same has value as the given string and find how close these words to the given string. I sort these suggestions in the order where the closest possibility comes first.
To get more words, i repeat this with adding a character each time from 'a' to 'z' with the given string and one more time with removing the character from all the possible indices.
Finally print the list.
Filename: dictionary.hpp
#ifndef __MY_DICTIONARY__
#define __MY_DICTIONARY__
#include <iostream>
#include <string>
using namespace std;
typedef struct word t_word;
class wordlist {
private:
string dictionaryfile;
t_word *dictionary;
public:
wordlist (string dictfile)
{
this->dictionaryfile = dictfile;
this->dictionary = NULL;
}
wordlist()
{
}
void setWordFile (string wordfile)
{
this->dictionaryfile = wordfile;
}
string getWordFile ()
{
return this->dictionaryfile;
}
~wordlist ()
{
}
bool generateDictionary ();
unsigned int calcHash (string);
t_word *create (string);
void rotateLeft (t_word*);
void rotateRight (t_word*);
void balanceFactors (t_word*, t_word*);
void balanceLeftRight (t_word*, t_word*);
void balanceRightLeft (t_word*, t_word*);
void balance (t_word*, t_word*);
void insert (string);
int getScore (string, string);
int findMatch (string, vector<string>&);
};
#endif
Filename: dictionary.cpp
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <stdlib.h>
using namespace std;
#include "dictionary.hpp"
// Node to store the word and the corresponding hash values
struct word{
string word;
unsigned int hash;
// Pointers to create the self-balanced binary search tree
t_word *parent;
t_word *left;
t_word *right;
char balanceFactor;
// Pointer to store the words with the same hash value
t_word *next;
};
// Generates the dictionary (self-balanced binary search tree)
// with the list of words from the given dictionary file.
bool wordlist::generateDictionary ()
{
ifstream ifs (this->dictionaryfile.c_str());
string str;
try
{
while (getline (ifs, str))
{
// Remove the new line
str.resize (str.length()-1);
// Add the word into the dictionary
insert (str);
}
ifs.close();
}
catch (exception& e)
{
cerr << "[Error]: " << e.what() << endl;
if (ifs.is_open())
{
ifs.close();
}
return false;
}
return true;
}
// Calculates the hash value for the given word
// Position of the characters does not have importance here.
unsigned int wordlist::calcHash (string word)
{
unsigned int hash = 0;
int val[26] = {0};
const char *exp = word.c_str();
while (('a' <= *exp &&
'z' >= *exp) ||
'\'' == *exp)
{
hash = hash + ((
(int(*exp) & 0x01) * 0xcf * 3+
(int(*exp) & 0x02) * 0xbf * 37+
(int(*exp) & 0x04) * 0xaf * 79+
(int(*exp) & 0x08) * 0x9f * 131+
(int(*exp) & 0x10) * 0x8f * 181+
(int(*exp) & 0x20) * 0x7f * 239+
(int(*exp) & 0x40) * 0x6f * 293+
(int(*exp) & 0x80) * 0x5f * 359)
* (int (*exp) - int ('a') + 1) * 421 ) +
(int (*exp) - int ('a') + 1) * 17;
exp++;
}
hash += word.length();
return hash;
}
// Create a new node with the given string and its has value
t_word* wordlist::create (string str)
{
unsigned int hash = calcHash (str);
t_word *t = new t_word;
if (NULL == t)
{
cerr << "[Error] memory allocation failed." << endl;
exit (1);
}
// cout << hash << " [" << str << "]" << endl;
t->word = str;
t->hash = hash;
t->parent = NULL;
t->left = NULL;
t->right = NULL;
t->next = NULL;
t->balanceFactor = '=';
return t;
}
void wordlist::rotateLeft (t_word *n)
{
t_word *t = n->right;
n->right = t->left;
if (NULL != t->left)
{
t->left->parent = n;
}
if (NULL != n->parent)
{
if (n->parent->left == n)
{
n->parent->left = t;
}
else
{
n->parent->right = t;
}
}
else
{
this->dictionary = t;
t->parent = NULL;
}
t->left = n;
t->parent = n->parent;
n->parent = t;
}
void wordlist::rotateRight (t_word *n)
{
t_word *t = n->left;
if (NULL == n->left && NULL == n->right)
return;
n->left = t->right;
if (NULL != t->right)
{
t->right->parent = n;
}
if (NULL != n->parent)
{
if (n->parent->left == n)
{
n->parent->left = t;
}
else
{
n->parent->right = t;
}
}
else
{
this->dictionary = t;
t->parent = NULL;
}
t->right = n;
t->parent = n->parent;
n->parent = t;
}
void wordlist::balanceFactors (t_word *ancestor, t_word *newLeaf)
{
t_word *t = newLeaf->parent;
while (t != NULL && t != ancestor)
{
if (newLeaf->hash <= t->hash)
t->balanceFactor = 'L';
else
t->balanceFactor = 'R';
t = t->parent;
}
}
void wordlist::balanceLeftRight (t_word *ancestor, t_word *newLeaf)
{
if (this->dictionary == ancestor)
{
ancestor->balanceFactor = '=';
}
else
{
t_word *t = ancestor;
if (newLeaf->hash <= ancestor->parent->hash)
{
ancestor->balanceFactor = 'R';
t = ancestor->parent->left;
}
else
{
ancestor->balanceFactor = '=';
ancestor->parent->left->balanceFactor = 'L';
}
balanceFactors (t, newLeaf);
}
}
void wordlist::balanceRightLeft (t_word *ancestor, t_word *newLeaf)
{
if (this->dictionary == ancestor)
{
ancestor->balanceFactor = '=';
}
else
{
t_word *t = ancestor;
if (newLeaf->hash > ancestor->parent->hash)
{
ancestor->balanceFactor = 'L';
t = ancestor->parent->right;
}
else
{
ancestor->balanceFactor = '=';
ancestor->parent->left->balanceFactor = 'R';
}
balanceFactors (t, newLeaf);
}
}
void wordlist::balance (t_word *ancestor, t_word *newLeaf)
{
// Balanced tree
if (NULL == ancestor)
{
if (newLeaf->hash <= this->dictionary->hash)
{
this->dictionary->balanceFactor = 'L';
}
else
{
this->dictionary->balanceFactor = 'R';
}
balanceFactors (this->dictionary, newLeaf);
}
// Added to the other side of the balanced factor
else if ((ancestor->balanceFactor == 'L' &&
newLeaf->hash > ancestor->hash) ||
(ancestor->balanceFactor == 'R' &&
newLeaf->hash <= ancestor->hash))
{
ancestor->balanceFactor = '=';
balanceFactors (ancestor, newLeaf);
}
// Added to the same side of the ancestor's child tree
else if ((ancestor->balanceFactor == 'R' &&
newLeaf->hash > ancestor->right->hash) ||
(ancestor->balanceFactor == 'L' &&
newLeaf->hash <= ancestor->left->hash))
{
char bf = ancestor->balanceFactor;
ancestor->balanceFactor = '=';
switch (bf)
{
case 'R':
rotateLeft(ancestor);
break;
case 'L':
rotateRight(ancestor);
break;
}
balanceFactors (ancestor->parent, newLeaf);
}
// Added to the right of ancestor's left child
else if (ancestor->balanceFactor == 'L' &&
newLeaf->hash > ancestor->left->hash)
{
rotateLeft(ancestor->left);
rotateRight(ancestor);
balanceLeftRight (ancestor, newLeaf);
}
// Added to the left of ancestor's right child
else if (ancestor->balanceFactor == 'R' &&
newLeaf->hash <= ancestor->right->hash)
{
rotateRight(ancestor->right);
rotateLeft(ancestor);
balanceRightLeft (ancestor, newLeaf);
}
else
{
cerr << "[Error]: Not a valid operation." << endl;
exit (1);
}
}
void wordlist::insert (string str)
{
t_word *t = create (str);
t_word *ancestor = NULL;
if (NULL == this->dictionary)
{
this->dictionary = t;
}
else
{
t_word *r = this->dictionary;
while (NULL != r)
{
if (r->balanceFactor != '=')
ancestor = r;
if (t->hash < r->hash)
{
if (NULL != r->left)
{
r =r->left;
}
else
{
break;
}
}
else if (t->hash > r->hash)
{
if (NULL != r->right)
{
r =r->right;
}
else
{
break;
}
}
else
{
while (NULL != r->next)
r = r->next;
r->next = t;
return;
}
}
t->parent = r;
if (t->hash <= r->hash)
{
r->left = t;
}
else
{
r->right = t;
}
// Balance the tree using AVL
balance (ancestor, t);
}
}
int wordlist::getScore (string s1, string s2)
{
int i = 0;
int j = 0;
int len = s2.length() * s1.length();
int *a_scores = new int [len];
int score = 0;
/*
cout << " ";
for (i=0; i<s1.length(); i++)
{
cout << s1[i] << " ";
}
cout << endl;
*/
for (j=0; j<s2.length(); j++)
{
// cout << s2[j] << " ";
for (i=0; i<s1.length(); i++)
{
if (s1[i] == s2[j])
{
if (i == 0 || j == 0)
{
*(a_scores+(j*s1.length())+i) = 1;
}
else
{
int x = i-1;
int y = j-1;
int t_score = 0;
for (;x>=0 && y>=0; x--,y--)
{
if (*(a_scores+(y*s1.length())+x) != 0)
{
t_score = *(a_scores+(y*s1.length())+x);
break;
}
}
for (x=i-1; x>=0; x--)
{
if (*(a_scores+(j*s1.length())+x) != 0)
{
if (t_score < *(a_scores+(j*s1.length())+x))
{
t_score = *(a_scores+(j*s1.length())+x) - 1;
}
break;
}
}
for (y=j-1; y>=0; y--)
{
if (*(a_scores+(y*s1.length())+i) != 0)
{
if (t_score < *(a_scores+(y*s1.length())+i))
{
t_score = *(a_scores+(y*s1.length())+i) - 1;
}
break;
}
}
*(a_scores+(j*s1.length())+i) = t_score + 1;
}
if (score < *(a_scores+(j*s1.length())+i))
{
score = *(a_scores+(j*s1.length())+i);
}
}
else
{
*(a_scores+(j*s1.length())+i) = 0;
}
// cout << *(a_scores+(j*s1.length())+i) << " ";
}
// cout << endl;
}
score = s1.length()-score;
delete[] a_scores;
return score;
}
int wordlist::findMatch (string str, vector<string>& result)
{
unsigned int hash = calcHash(str);
t_word *t = this->dictionary;
unsigned int count = 0;
unsigned int level = 0;
while (NULL != t)
{
level ++;
if (hash < t->hash)
{
t = t->left;
}
else if (hash > t->hash)
{
t = t->right;
}
else
{
while (NULL != t)
{
result.resize(count+1);
result.at(count) = t->word;
t = t->next;
count++;
}
return level;
}
}
return level;
}
Filename: spellCheck.cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <iomanip>
#include <sstream>
#include <string>
#include <algorithm>
using namespace std;
#include "dictionary.hpp"
int main ()
{
wordlist mydict ("wordsEn.txt");
// My Dictionary
mydict.generateDictionary ();
string str;
// My wordlist which are to be checked
ifstream ifs ("wordlist.txt");
while (getline (ifs, str))
{
str.resize (str.length()-1);
vector <string> matches;
int count = mydict.findMatch (str, matches);
if (0 < matches.size())
{
for (long index=0; index < matches.size(); index++)
{
int score = mydict.getScore (str, matches.at(index));
ostringstream ss;
ss << setw(2) << setfill('0') << score;
matches.at(index) = ss.str() + matches.at(index);
}
sort (matches.begin(), matches.end());
string tmp = matches.at(0);
// Exact match. Goto the next word in the list
if ('0' == tmp[0] && '0' == tmp[1])
{
cout << "Spelling is CORRECT for (" << str << ")" <<endl;
continue;
}
}
// Add each character from 'a' to 'z' one by one and check if we
// get more suggesstions.
string alpha = "abcdefghijklmnopqrstuvwxyz";
vector<string> tmp_matches;
for (int i=0; i<alpha.length(); i++)
{
mydict.findMatch (str+alpha[i], tmp_matches);
for (long index=0; index < tmp_matches.size(); index++)
{
int score = mydict.getScore (str, tmp_matches.at(index));
ostringstream ss;
ss << setw(2) << setfill('0') << score + 1;
matches.push_back(ss.str() + tmp_matches.at(index));
}
tmp_matches.clear();
}
// Check the suggesstions for the strings by removing one character
// in different index and try all the posible index values.
for (int i=0; i<str.length(); i++)
{
string tmp_str = str;
tmp_str.erase (i, 1);
mydict.findMatch (tmp_str, tmp_matches);
for (long index=0; index < tmp_matches.size(); index++)
{
int score = mydict.getScore (tmp_str, tmp_matches.at(index));
ostringstream ss;
ss << setw(2) << setfill('0') << score + 1;
matches.push_back(ss.str() + tmp_matches.at(index));
}
tmp_matches.clear();
}
if (0 < matches.size())
{
// Sort all the suggesstions with thr rank
sort (matches.begin(), matches.end());
// Remove the rank element from the strings
for (long index=0; index < matches.size(); index++)
{
matches.at(index).erase(0,2);
}
// Removes duplicate consecutive elements
// matches.erase( unique( matches.begin(), matches.end() ), matches.end() );
// Remove all the duplicates
vector<string> uniqueMatches;
uniqueMatches.push_back(matches.at(0));
for (long index=1; index < matches.size(); index++)
{
if (uniqueMatches.end() == find (uniqueMatches.begin(), uniqueMatches.end(), matches.at(index)))
{
uniqueMatches.push_back(matches.at(index));
}
}
// Final result
cout << "Matcing for (" << str << "): { ";
for (long index=0; index < uniqueMatches.size(); index++)
{
cout <<uniqueMatches.at(index) << " ";
}
cout << "}" << endl;
}
else
{
cout << count << " Matche not found for (" << str << ")" << endl;
}
}
return 0;
}
You can find the wordlist.txt from here
No comments :
Post a Comment