My hash function gives me same hash value for the strings even if the character are jumbled. For example, "add" and "dad" will have the same hash value. I collect the list of words which have the same has value as the given string and find how close these words to the given string. I sort these suggestions in the order where the closest possibility comes first.
To get more words, i repeat this with adding a character each time from 'a' to 'z' with the given string and one more time with removing the character from all the possible indices.
Finally print the list.
Filename: dictionary.hpp
#ifndef __MY_DICTIONARY__ #define __MY_DICTIONARY__ #include <iostream> #include <string> using namespace std; typedef struct word t_word; class wordlist { private: string dictionaryfile; t_word *dictionary; public: wordlist (string dictfile) { this->dictionaryfile = dictfile; this->dictionary = NULL; } wordlist() { } void setWordFile (string wordfile) { this->dictionaryfile = wordfile; } string getWordFile () { return this->dictionaryfile; } ~wordlist () { } bool generateDictionary (); unsigned int calcHash (string); t_word *create (string); void rotateLeft (t_word*); void rotateRight (t_word*); void balanceFactors (t_word*, t_word*); void balanceLeftRight (t_word*, t_word*); void balanceRightLeft (t_word*, t_word*); void balance (t_word*, t_word*); void insert (string); int getScore (string, string); int findMatch (string, vector<string>&); }; #endifFilename: dictionary.cpp
#include <iostream> #include <vector> #include <string> #include <fstream> #include <stdlib.h> using namespace std; #include "dictionary.hpp" // Node to store the word and the corresponding hash values struct word{ string word; unsigned int hash; // Pointers to create the self-balanced binary search tree t_word *parent; t_word *left; t_word *right; char balanceFactor; // Pointer to store the words with the same hash value t_word *next; }; // Generates the dictionary (self-balanced binary search tree) // with the list of words from the given dictionary file. bool wordlist::generateDictionary () { ifstream ifs (this->dictionaryfile.c_str()); string str; try { while (getline (ifs, str)) { // Remove the new line str.resize (str.length()-1); // Add the word into the dictionary insert (str); } ifs.close(); } catch (exception& e) { cerr << "[Error]: " << e.what() << endl; if (ifs.is_open()) { ifs.close(); } return false; } return true; } // Calculates the hash value for the given word // Position of the characters does not have importance here. unsigned int wordlist::calcHash (string word) { unsigned int hash = 0; int val[26] = {0}; const char *exp = word.c_str(); while (('a' <= *exp && 'z' >= *exp) || '\'' == *exp) { hash = hash + (( (int(*exp) & 0x01) * 0xcf * 3+ (int(*exp) & 0x02) * 0xbf * 37+ (int(*exp) & 0x04) * 0xaf * 79+ (int(*exp) & 0x08) * 0x9f * 131+ (int(*exp) & 0x10) * 0x8f * 181+ (int(*exp) & 0x20) * 0x7f * 239+ (int(*exp) & 0x40) * 0x6f * 293+ (int(*exp) & 0x80) * 0x5f * 359) * (int (*exp) - int ('a') + 1) * 421 ) + (int (*exp) - int ('a') + 1) * 17; exp++; } hash += word.length(); return hash; } // Create a new node with the given string and its has value t_word* wordlist::create (string str) { unsigned int hash = calcHash (str); t_word *t = new t_word; if (NULL == t) { cerr << "[Error] memory allocation failed." << endl; exit (1); } // cout << hash << " [" << str << "]" << endl; t->word = str; t->hash = hash; t->parent = NULL; t->left = NULL; t->right = NULL; t->next = NULL; t->balanceFactor = '='; return t; } void wordlist::rotateLeft (t_word *n) { t_word *t = n->right; n->right = t->left; if (NULL != t->left) { t->left->parent = n; } if (NULL != n->parent) { if (n->parent->left == n) { n->parent->left = t; } else { n->parent->right = t; } } else { this->dictionary = t; t->parent = NULL; } t->left = n; t->parent = n->parent; n->parent = t; } void wordlist::rotateRight (t_word *n) { t_word *t = n->left; if (NULL == n->left && NULL == n->right) return; n->left = t->right; if (NULL != t->right) { t->right->parent = n; } if (NULL != n->parent) { if (n->parent->left == n) { n->parent->left = t; } else { n->parent->right = t; } } else { this->dictionary = t; t->parent = NULL; } t->right = n; t->parent = n->parent; n->parent = t; } void wordlist::balanceFactors (t_word *ancestor, t_word *newLeaf) { t_word *t = newLeaf->parent; while (t != NULL && t != ancestor) { if (newLeaf->hash <= t->hash) t->balanceFactor = 'L'; else t->balanceFactor = 'R'; t = t->parent; } } void wordlist::balanceLeftRight (t_word *ancestor, t_word *newLeaf) { if (this->dictionary == ancestor) { ancestor->balanceFactor = '='; } else { t_word *t = ancestor; if (newLeaf->hash <= ancestor->parent->hash) { ancestor->balanceFactor = 'R'; t = ancestor->parent->left; } else { ancestor->balanceFactor = '='; ancestor->parent->left->balanceFactor = 'L'; } balanceFactors (t, newLeaf); } } void wordlist::balanceRightLeft (t_word *ancestor, t_word *newLeaf) { if (this->dictionary == ancestor) { ancestor->balanceFactor = '='; } else { t_word *t = ancestor; if (newLeaf->hash > ancestor->parent->hash) { ancestor->balanceFactor = 'L'; t = ancestor->parent->right; } else { ancestor->balanceFactor = '='; ancestor->parent->left->balanceFactor = 'R'; } balanceFactors (t, newLeaf); } } void wordlist::balance (t_word *ancestor, t_word *newLeaf) { // Balanced tree if (NULL == ancestor) { if (newLeaf->hash <= this->dictionary->hash) { this->dictionary->balanceFactor = 'L'; } else { this->dictionary->balanceFactor = 'R'; } balanceFactors (this->dictionary, newLeaf); } // Added to the other side of the balanced factor else if ((ancestor->balanceFactor == 'L' && newLeaf->hash > ancestor->hash) || (ancestor->balanceFactor == 'R' && newLeaf->hash <= ancestor->hash)) { ancestor->balanceFactor = '='; balanceFactors (ancestor, newLeaf); } // Added to the same side of the ancestor's child tree else if ((ancestor->balanceFactor == 'R' && newLeaf->hash > ancestor->right->hash) || (ancestor->balanceFactor == 'L' && newLeaf->hash <= ancestor->left->hash)) { char bf = ancestor->balanceFactor; ancestor->balanceFactor = '='; switch (bf) { case 'R': rotateLeft(ancestor); break; case 'L': rotateRight(ancestor); break; } balanceFactors (ancestor->parent, newLeaf); } // Added to the right of ancestor's left child else if (ancestor->balanceFactor == 'L' && newLeaf->hash > ancestor->left->hash) { rotateLeft(ancestor->left); rotateRight(ancestor); balanceLeftRight (ancestor, newLeaf); } // Added to the left of ancestor's right child else if (ancestor->balanceFactor == 'R' && newLeaf->hash <= ancestor->right->hash) { rotateRight(ancestor->right); rotateLeft(ancestor); balanceRightLeft (ancestor, newLeaf); } else { cerr << "[Error]: Not a valid operation." << endl; exit (1); } } void wordlist::insert (string str) { t_word *t = create (str); t_word *ancestor = NULL; if (NULL == this->dictionary) { this->dictionary = t; } else { t_word *r = this->dictionary; while (NULL != r) { if (r->balanceFactor != '=') ancestor = r; if (t->hash < r->hash) { if (NULL != r->left) { r =r->left; } else { break; } } else if (t->hash > r->hash) { if (NULL != r->right) { r =r->right; } else { break; } } else { while (NULL != r->next) r = r->next; r->next = t; return; } } t->parent = r; if (t->hash <= r->hash) { r->left = t; } else { r->right = t; } // Balance the tree using AVL balance (ancestor, t); } } int wordlist::getScore (string s1, string s2) { int i = 0; int j = 0; int len = s2.length() * s1.length(); int *a_scores = new int [len]; int score = 0; /* cout << " "; for (i=0; i<s1.length(); i++) { cout << s1[i] << " "; } cout << endl; */ for (j=0; j<s2.length(); j++) { // cout << s2[j] << " "; for (i=0; i<s1.length(); i++) { if (s1[i] == s2[j]) { if (i == 0 || j == 0) { *(a_scores+(j*s1.length())+i) = 1; } else { int x = i-1; int y = j-1; int t_score = 0; for (;x>=0 && y>=0; x--,y--) { if (*(a_scores+(y*s1.length())+x) != 0) { t_score = *(a_scores+(y*s1.length())+x); break; } } for (x=i-1; x>=0; x--) { if (*(a_scores+(j*s1.length())+x) != 0) { if (t_score < *(a_scores+(j*s1.length())+x)) { t_score = *(a_scores+(j*s1.length())+x) - 1; } break; } } for (y=j-1; y>=0; y--) { if (*(a_scores+(y*s1.length())+i) != 0) { if (t_score < *(a_scores+(y*s1.length())+i)) { t_score = *(a_scores+(y*s1.length())+i) - 1; } break; } } *(a_scores+(j*s1.length())+i) = t_score + 1; } if (score < *(a_scores+(j*s1.length())+i)) { score = *(a_scores+(j*s1.length())+i); } } else { *(a_scores+(j*s1.length())+i) = 0; } // cout << *(a_scores+(j*s1.length())+i) << " "; } // cout << endl; } score = s1.length()-score; delete[] a_scores; return score; } int wordlist::findMatch (string str, vector<string>& result) { unsigned int hash = calcHash(str); t_word *t = this->dictionary; unsigned int count = 0; unsigned int level = 0; while (NULL != t) { level ++; if (hash < t->hash) { t = t->left; } else if (hash > t->hash) { t = t->right; } else { while (NULL != t) { result.resize(count+1); result.at(count) = t->word; t = t->next; count++; } return level; } } return level; }Filename: spellCheck.cpp
#include <iostream> #include <fstream> #include <vector> #include <iomanip> #include <sstream> #include <string> #include <algorithm> using namespace std; #include "dictionary.hpp" int main () { wordlist mydict ("wordsEn.txt"); // My Dictionary mydict.generateDictionary (); string str; // My wordlist which are to be checked ifstream ifs ("wordlist.txt"); while (getline (ifs, str)) { str.resize (str.length()-1); vector <string> matches; int count = mydict.findMatch (str, matches); if (0 < matches.size()) { for (long index=0; index < matches.size(); index++) { int score = mydict.getScore (str, matches.at(index)); ostringstream ss; ss << setw(2) << setfill('0') << score; matches.at(index) = ss.str() + matches.at(index); } sort (matches.begin(), matches.end()); string tmp = matches.at(0); // Exact match. Goto the next word in the list if ('0' == tmp[0] && '0' == tmp[1]) { cout << "Spelling is CORRECT for (" << str << ")" <<endl; continue; } } // Add each character from 'a' to 'z' one by one and check if we // get more suggesstions. string alpha = "abcdefghijklmnopqrstuvwxyz"; vector<string> tmp_matches; for (int i=0; i<alpha.length(); i++) { mydict.findMatch (str+alpha[i], tmp_matches); for (long index=0; index < tmp_matches.size(); index++) { int score = mydict.getScore (str, tmp_matches.at(index)); ostringstream ss; ss << setw(2) << setfill('0') << score + 1; matches.push_back(ss.str() + tmp_matches.at(index)); } tmp_matches.clear(); } // Check the suggesstions for the strings by removing one character // in different index and try all the posible index values. for (int i=0; i<str.length(); i++) { string tmp_str = str; tmp_str.erase (i, 1); mydict.findMatch (tmp_str, tmp_matches); for (long index=0; index < tmp_matches.size(); index++) { int score = mydict.getScore (tmp_str, tmp_matches.at(index)); ostringstream ss; ss << setw(2) << setfill('0') << score + 1; matches.push_back(ss.str() + tmp_matches.at(index)); } tmp_matches.clear(); } if (0 < matches.size()) { // Sort all the suggesstions with thr rank sort (matches.begin(), matches.end()); // Remove the rank element from the strings for (long index=0; index < matches.size(); index++) { matches.at(index).erase(0,2); } // Removes duplicate consecutive elements // matches.erase( unique( matches.begin(), matches.end() ), matches.end() ); // Remove all the duplicates vector<string> uniqueMatches; uniqueMatches.push_back(matches.at(0)); for (long index=1; index < matches.size(); index++) { if (uniqueMatches.end() == find (uniqueMatches.begin(), uniqueMatches.end(), matches.at(index))) { uniqueMatches.push_back(matches.at(index)); } } // Final result cout << "Matcing for (" << str << "): { "; for (long index=0; index < uniqueMatches.size(); index++) { cout <<uniqueMatches.at(index) << " "; } cout << "}" << endl; } else { cout << count << " Matche not found for (" << str << ")" << endl; } } return 0; }You can find the wordlist.txt from here
No comments :
Post a Comment