// David Ferris - Capstone 2014 - Main.cpp // Last Updated 5/12/2014 #include #include #include #include #include #include "Word.h" #include "Hash.h" using namespace std; //---------------------- Prototypes ---------------------- int DefineGlobalArrays(); int AddWordsFromSentence(int index, Hash & word_table); int SplitIntoSentences(); int WriteResultsToFile(Hash * words, int sentences_count); void QuickSort(int A[], string B[], int first, int last); int Split(int A[], string B[], int first, int last); string RemoveSuffix(string word); void LowercaseString(string & text); bool IsIgnoredWord(string word); int GetFileData(); int FindNearestPrime(int beginning_value); bool IsPrime(int val); bool TitleDetected(string sentence); //------------------- Global Variables ------------------- int CHAR_ENDS_SENTENCE [256]; // Signifies whether each char (ascii 0-255) is sentence-ending int SPECIAL_CHARS [256]; // Signifies whether each char (ascii 0-255) is special (i.e. not alphabetical) const int SUFFIX_COUNT = 3; const string SUFFIXES [] = { "ing", "ed", "ly" }; // Suffixes that we want to remove from words int IGNORED_WORDS_COUNT; // Tracks the size of the IGNORED_WORDS array. string * IGNORED_WORDS; // Points to the array of words that we want to ignore. // The file containing these words was taken from Michael Klosiewski's 2013 project and modified const int TOP_WORDS = 5; // Specifies how many words and their data the user wants to be written to the resulting text file. // This data will be used to render the webpage. int main() { if (DefineGlobalArrays() == -1) { cout << "ERROR: DefineGlobalArrays failed. Exiting.." << endl; return 0; } // Since we want the Hash size to be nearly equal to the expected amount of items, and prime, according to Dr. McVey, we will // find the approx. 
amount of words, and then find the nearest prime number >= the word estimate int file_word_estimate = GetFileData(); // Find the nearest prime #, >= to the calculated size int calculated_hash_size = FindNearestPrime(file_word_estimate); // Dumping data cout << "Word Estimate: " << file_word_estimate << endl; cout << "Size for hash: " << calculated_hash_size << endl; // Create the Hash Hash word_table = Hash(calculated_hash_size); string * sentences = 0; // Split the input file into sentences - which are added the sentences file int sentence_count = SplitIntoSentences(); // Check to see if SplitIntoSentences errored if (sentence_count == -1) { cout << "ERROR: SplitIntoSentences failed. Exiting.." << endl; return 0; } int word_count = 0; for(int i = 0; i < sentence_count; i++) { word_count += AddWordsFromSentence(i, word_table); } //word_table.Dump(); // Check to see if WriteResultsToFile errored if (WriteResultsToFile(&word_table, sentence_count) == -1) { cout << "ERROR: WriteResultsToFile failed. Exiting.." << endl; return 0; } return 0; } /************************************************************ --------------------DefineGlobalArrays------------------------ The sole purpose of this function is to populate the CHAR_ENDS_SENTENCE, SPECIAL_CHARS, and IGNORED_WORDS arrays. In the 2 former arrays, a cell's index corresponds to an ascii character's decimal value. In the CHAR_ENDS_SENTENCE array, a 0 represents a character that does not end a sentence, and a 1 represents a character that does. Sentence Ending Chars = ! . ? (for now) In the SPECIAL_CHARS array, a 0 represents a character that is OK to include in a word. A 1 represents a character that we do not want to be a part of a word. For now, only the alphabet will be accepted into words. Lastly, the IGNORED_WORDS array is a string array, populated with words that I do not want to include in the Hash. 
These are words that are seen as being "useless" - meaning that they do not provide any insight as to the content of the overall text. This array is populated via a while loop, reading from the ignored_words.txt file. This file contains words from Michael Klosiewski's 2013 project, along with some additions that I felt were necessary. ************************************************************/ int DefineGlobalArrays() { // Initialize CHAR_ENDS_SENTENCE to straight 0's, and // SPECIAL_CHARS to straight 1's for(int i = 0; i < 256; i++) { CHAR_ENDS_SENTENCE[i] = 0; SPECIAL_CHARS[i] = 1; } // Set Sentence Ending Chars to 1 in CHAR_ENDS_SENTENCE CHAR_ENDS_SENTENCE[33] = 1; // ! CHAR_ENDS_SENTENCE[46] = 1; // . CHAR_ENDS_SENTENCE[63] = 1; // ? // Set alphabet to 0 in SPECIAL_CHARS for(int j = 65; j <= 90; j++) { SPECIAL_CHARS[j] = 0; // UPPER-CASE SPECIAL_CHARS[j + 32] = 0; // lower-case } // Fill the IGNORED_WORDS array with words from the ignored_words.txt file ifstream ignored_words_file; string filename = "ignored_words.txt"; string line = ""; int line_count = 0; string * temp; ignored_words_file.open(filename); if(!ignored_words_file.is_open()) { std::cout << "ERROR: ignored words file could not be opened. Exiting.." 
<< endl; return -1; } // Allocate an IGNORED_WORDS array of size 0, to avoid creating more cases within the loop IGNORED_WORDS = new string [0]; while(1) { // Get the next word from the file getline(ignored_words_file, line); line_count++; // If the end of the file was reached, break the loop if(ignored_words_file.eof()) break; // Allocate memory for the temp array temp = new string [line_count]; // Transfer data from IGNORED_WORDS to temp for(int i = 0; i < (line_count - 1); i++) temp[i] = IGNORED_WORDS[i]; // Add the newest line to the temp array temp[line_count - 1] = line; // Delete the old IGNORED_WORDS ARRAY delete [] IGNORED_WORDS; // Point IGNORED_WORDS to temp IGNORED_WORDS = temp; // Point temp to null temp = 0; } IGNORED_WORDS_COUNT = line_count - 1; // Test dump the IGNORED_WORDS array /*for(int j = 0; j < (line_count - 1); j++) { cout << j << ": " << IGNORED_WORDS[j] << endl; }*/ } /************************************************************ ------------------AddWordsFromSentence-------------------- Takes a sentence and adds all words to the Hash. Splits on space , ; and . delimiters. (a) Grabs the next char in the sentence -- if the char is not a blank space, a semicolon, a comma, or a period, then it is added to the word. (b) If the char is a space, semicolon, comma, or period, then the word has ended. The char is not added to the word, and the word is then sent for further processing. First, any unwanted suffix is removed from the word, then the word is compared to the list of ignored words. If it is not found in the list, it is added to the Hash. Repeats until the end of the sentence is reached. 
************************************************************/ int AddWordsFromSentence(int index, Hash & word_table) { ifstream sentence_input; string filename = "sentences.txt"; string word_string = ""; string modified_word = ""; // This will be the word with the suffix removed string sentence = ""; char current_char; int word_count = 0; sentence_input.open(filename); if(!sentence_input.is_open()) { cout << "ERROR: Sentences file could not be opened -- Program terminating" << endl; return 0; } for(int j = 0; j <= index; j++) getline(sentence_input, sentence); sentence_input.close(); for(int i = 0; i < sentence.length(); i++) { // (a) current_char = sentence[i]; // If the char is not a special char, add it to the word if(SPECIAL_CHARS[current_char] == 0) word_string += current_char; // (b) else { if(word_string.length() > 0) { // Convert the word to lowercase before processing LowercaseString(word_string); // Remove any suffix from the word modified_word = RemoveSuffix(word_string); // If the word is not in the list of ignored words, add it to the Hash if(!IsIgnoredWord(modified_word)) word_table.Add(Word(modified_word, index)); word_string = ""; word_count++; } } } return word_count; } /************************************************************ ---------------------SplitIntoSentences---------------------- - This function takes no parameters. - The function returns an int, which represents the amount of sentences in the text. (a) Variables are declared and the char gathering loop begins. chars will be taken from the text file one-at-a-time. (b) Gets the next char from the file. If the char is not a period, exclamation point, or question mark, it is added to the string tracking the sentence that is currently being built. This part also filters out the space after the previously mentioned 3 marks of punctuation. (c) If one of the 3 marks has been reached, then it is added to the sentence. At this point, we check to see if the '.' was a part of a title. 
If it was, the sentence continues to be built. If it was not, then he sentence is then written to the sentences text file, with a newline at the end. This allows us to keep a file with 1 sentence per line, so that we can avoid maintaining a large array of sentences. Repeat until the file input stream hits the end-of-file ************************************************************/ int SplitIntoSentences() { // (a) ifstream file_input; ofstream file_output; string input_filename = "sample_in.txt"; string output_filename = "sentences.txt"; char current_char = ' '; string current_sentence = ""; int sentence_count = 0; int char_count = 0; // Open the input and output files file_input.open(input_filename); file_output.open(output_filename); // If the input file was not opened, display error message and return if(!file_input.is_open()) { cout << "ERROR: Input file could not be opened -- Program terminating" << endl; return -1; } // If the input file was opened, display confirmation else cout << "File " << input_filename << " was successfully opened" << endl; // If the output file was not opened, display error message and return if(!file_output.is_open()) { cout << "ERROR: Output file could not be opened -- Program terminating" << endl; return -1; } // If the output file was opened, display confirmation else cout << "File " << output_filename << " was successfully opened" << endl; //cout << text.length() << endl; //TESTING //for(int i = 0; i < text.length(); i++) while(1) { // (b) file_input.read(¤t_char, 1); char_count++; if(file_input.eof()) break; //cout << current_char << " - "; //TESTING dump of chars from file if(CHAR_ENDS_SENTENCE[current_char] == 0) { if((!(current_char == ' ' && current_sentence == "")) && (current_char != '\n')) current_sentence += current_char; } // (c) else { current_sentence += current_char; // If sentence ends in a '.', make sure this is not just a part of a title, before // writing the sentence to file and beginning a new sentence 
if(current_char == '.') { if(!TitleDetected(current_sentence)) { file_output << current_sentence << '\n'; sentence_count++; current_sentence = ""; } } else { file_output << current_sentence << '\n'; sentence_count++; current_sentence = ""; } } } // I am done with the files, so they need to be closed file_input.close(); file_output.close(); return sentence_count; } /************************************************************ ---------------------WriteResultsToFile---------------------- - This function's primary purpose is to use the data that has been generated and record it in a text file - This text file will then be read by the visualization program (using PHP, HTML, and JavaScript) - The function also needs to order the words by the # of occurrances, so that the top TOP_WORDS can be recorded. TOP_WORDS is a global constant that can be set above. (a) The amount of words in the hash is found and stored in TOTAL_WORD_COUNT. (b) Two arrays of size TOTAL_WORD_COUNT are created. One holds the word's text, and the other holds the # of occurrances. These arrays are populated by fetching data from the Hash. Also initialized are the sentence indices pointer and an int that will store the sentence occurrance count. The pointer will be used to hold the array of sentence indices where a word occurs. The int will be used to track how long this array is. (b) The data from the two arrays will be printed into the text file, in the following format: - TOP_WORDS word texts, one per line - The sentence indices in which each of the top words appears, one line for each of the top words. Sentence line #s are seperated by ',' (e.g., 1,4,16). Total of TOP_WORDS lines. 
- The sentences, one per line ************************************************************/ int WriteResultsToFile(Hash * words, int sentences_count) { // (a) // Gets the total amount of words in the words hash const int TOTAL_WORD_COUNT = words->GetTotalWords(); // If there are less unique words in the hash than we are trying to store in the file if(TOTAL_WORD_COUNT < TOP_WORDS) { cout << "ERROR: Less words in hash than TOP_WORDS collection amount. Exiting function" << endl; return -1; } //cout << "Total Words: " << TOTAL_WORD_COUNT << endl; // (b) string * word_text = new string[TOTAL_WORD_COUNT]; int * word_occurrances = new int[TOTAL_WORD_COUNT]; int * sentence_indices = 0; int sentence_occurrance_count = 0; string sentence = ""; ifstream sentences_input; ofstream file_output; string sentences_filename = "sentences.txt"; string output_filename = "sample_shared.txt"; words->GetWordData(word_text, word_occurrances); //cout << "HERE" << endl; // Test dump of retrieved word data /*cout << "WORD DATA" << endl; for(int i = 0; i < TOTAL_WORD_COUNT; i++) cout << "Text: " << word_text[i] << ", Count: " << word_occurrances[i] << endl;*/ // Sends the two arrays to be sorted by occurrance count // Both must be sent in order to remain parallel QuickSort(word_occurrances, word_text, 0, TOTAL_WORD_COUNT-1); /*cout << "WORD DATA" << endl; for(int i = 0; i < TOTAL_WORD_COUNT; i++) cout << "Text: " << word_text[i] << ", Count: " << word_occurrances[i] << endl;*/ sentences_input.open("sentences.txt"); if(!sentences_input.is_open()) { cout << "ERROR: Sentences file could not be opened -- Program terminating" << endl; return -1; } else cout << "File " << sentences_filename << " was successfully opened" << endl; file_output.open(output_filename); // If the file was not opened, display error message and return if(!file_output.is_open()) { cout << "ERROR: Output file could not be opened -- Program terminating" << endl; return -1; } // If the file was opened, display confirmation else 
{ cout << "File " << output_filename << " was successfully opened" << endl; // Writes the top TOP_WORDS most used words to the file - one per line for(int j = 0; j < TOP_WORDS; j++) file_output << word_text[j] << "\n"; // Writes the sentence-table indices of each occurrance of each word - one word per line for(int k = 0; k < TOP_WORDS; k++) { sentence_indices = words->FindWordByText(word_text[k])->GetSentenceIndices(); sentence_occurrance_count = words->FindWordByText(word_text[k])->GetAppearancesCount(); // Prints all but the last sentence index for(int z = 0; z < (sentence_occurrance_count - 1); z++) file_output << sentence_indices[z] << ","; // Prints the final sentence index, with no trailing comma file_output << sentence_indices[sentence_occurrance_count - 1] << "\n"; } // Write the sentences to the file for(int q = 0; q < sentences_count; q++) { getline(sentences_input, sentence); file_output << sentence << endl; } sentences_input.close(); file_output.close(); return 0; } } /************************************************************ ------------------------RemoveSuffix------------------------- This function will remove common suffixes from words, in an effort to obtain the root word. It is hoped that the root word may then match up to other uses of the root word in the words table. (a) Use a for loop, so that each suffix can be compared to the word (b) Based on the length of the suffix, find the last n characters of the word (c) Compare the suffix to the last n letters of the word. If they are equal, then the word has the suffix. The suffix should be removed, and the loop should break (d) If the suffix was not found in the word, move on to the next suffix. If none of the suffixes are found in the word, the word will be returned as is. 
/************************************************************
------------------------RemoveSuffix-------------------------
Strips one common suffix ("ing", "ed", "ly" -- checked in that
order) from the end of a word, in the hope that the remaining
root matches other uses of the same root in the words table.

A suffix is removed only when the word is strictly longer than
the suffix, which both leaves a non-empty root and keeps the
compare() offset in range. At most one suffix is removed; a
word matching none of them is returned unchanged.
************************************************************/
std::string RemoveSuffix(std::string word)
{
    // Local mirror of the global SUFFIXES/SUFFIX_COUNT pair so this
    // function is self-contained. Keep the two lists in sync.
    static const std::string suffixes[] = { "ing", "ed", "ly" };
    static const int suffix_count = 3;

    const std::string::size_type word_length = word.length();

    for (int i = 0; i < suffix_count; i++)
    {
        const std::string & suffix = suffixes[i];
        const std::string::size_type suffix_length = suffix.length();

        // compare(pos, len, s) == 0 means the word's last `len` chars
        // equal the suffix -- replaces the old manual char-copy loop
        if (word_length > suffix_length &&
            word.compare(word_length - suffix_length, suffix_length, suffix) == 0)
        {
            word.erase(word_length - suffix_length, suffix_length);
            break; // remove at most one suffix
        }
    }

    return word;
}

/************************************************************
-----------------------LowercaseString-----------------------
Converts a string to lower-case in place, so later string
comparisons are case-insensitive.
************************************************************/
void LowercaseString(std::string & text)
{
    for (std::string::size_type i = 0; i < text.length(); i++)
        // Cast through unsigned char first: passing a negative char
        // value to tolower is undefined behavior
        text[i] = (char)std::tolower((unsigned char)text[i]);
}
************************************************************/ bool IsIgnoredWord(string word) { for (int i = 0; i < IGNORED_WORDS_COUNT; i++) { if (word == IGNORED_WORDS[i]) return true; } return false; } /************************************************************ -------------------------GetFileData------------------------- The purpose of GetFileData is to retrieve an estimate of how many words are contained in the input text file. This is done by using the stat function along with an estimation of the average length of an English- Language word. According to Wolfram Alpha, the average English- Language word is 5.1 characters long. I will be using 5, since all I need is an estimate. By using the stat function, I can see the amount of bytes in the text file, and since each char takes up 1 byte, I can estimate the amount of words contained in the file. Though Wolfram Alpha says that the average word is 5.1 chars long, I will use 6, to try to account for spaces. ************************************************************/ int GetFileData() { char * filename = "sample_in.txt"; int file_size; int approx_word_count; struct stat buffer; // Call the stat fcn to retrieve file data stat(filename, &buffer); // Find the file size (in bytes) from the file data file_size = buffer.st_size; // Divide by 6 for approx. word count approx_word_count = file_size / 6; cout << "File size: " << file_size << " bytes" << endl; cout << "Approximate word count: " << approx_word_count << endl; return approx_word_count; } /************************************************************ ----------------------FindNearestPrime----------------------- This function will find the first prime number >= the given int. The purpose of this is to find a hash table size that will accomodate all words in the file while avoiding clustering. 
bool IsPrime(int val); // forward declaration (used by FindNearestPrime below)

/************************************************************
----------------------FindNearestPrime-----------------------
Returns the smallest prime >= beginning_value (and never less
than 2, the smallest prime). Used to pick a hash table size
that accommodates all words while avoiding clustering.
************************************************************/
int FindNearestPrime(int beginning_value)
{
    // Clamp to 2 so estimates of 0 or 1 can't return a non-prime
    int val = (beginning_value > 2) ? beginning_value : 2;

    while (!IsPrime(val))
        val++;

    return val;
}

/************************************************************
---------------------------IsPrime---------------------------
Returns true when val is prime, by trial division with every
i such that i*i <= val: a composite number always has a factor
no larger than its square root.

(The previous bound of i < val/2 made the loop empty for
val == 4, wrongly reporting 4 as prime, and also accepted 0
and 1; the val < 2 guard fixes those.)
************************************************************/
bool IsPrime(int val)
{
    if (val < 2)
        return false; // 0, 1, and negatives are not prime

    for (int i = 2; i * i <= val; i++)
    {
        if ((val % i) == 0)
            return false;
    }

    return true;
}
/************************************************************
------------------------TitleDetected------------------------
Decides whether the '.' just appended to `sentence` belongs to
a title abbreviation (Mr., Mrs., Ms., Dr., Fr., St., Rev.)
rather than genuinely ending the sentence. All comparisons are
done in lower case to absorb typing inconsistencies.

For sentences of length >= 5, the character before the title
must be a space, so a word that merely *ends* in, say, "dr."
is not mistaken for a title. Shorter sentences get special
handling: at length 4 the title must start the sentence, so
only the 4-char titles (no leading space) are possible --
SplitIntoSentences never begins a sentence with a blank, so a
space-prefixed 3-char title cannot occur there. At length 3
only the bare 3-char titles fit. Anything shorter cannot hold
a title, so false is returned.
************************************************************/
bool TitleDetected(std::string sentence)
{
    int sentence_length = sentence.length();

    if (sentence_length >= 5)
    {
        // Lower-case the final five characters; the final four are the
        // same string without its first character.
        std::string tail5 = "";
        for (int i = sentence_length - 5; i < sentence_length; i++)
            tail5 += std::tolower(sentence[i]);
        std::string tail4 = tail5.substr(1);

        // 4-char titles need " Xxx." (5 chars with the space);
        // 3-char titles need " Xx." (4 chars with the space)
        return tail5 == " mrs." || tail5 == " rev."
            || tail4 == " mr."  || tail4 == " ms." || tail4 == " dr."
            || tail4 == " fr."  || tail4 == " st.";
    }

    if (sentence_length == 4)
    {
        // The title must begin the sentence: no leading space to check
        std::string tail4 = "";
        for (int i = 0; i < 4; i++)
            tail4 += std::tolower(sentence[i]);

        return tail4 == "mrs." || tail4 == "rev.";
    }

    if (sentence_length == 3)
    {
        // Bare 3-char titles starting the sentence, e.g. "Dr." before
        // "Dr. Soandso ..." has been read any further
        std::string tail3 = "";
        for (int i = 0; i < 3; i++)
            tail3 += std::tolower(sentence[i]);

        return tail3 == "mr." || tail3 == "ms." || tail3 == "dr."
            || tail3 == "fr." || tail3 == "st.";
    }

    // Too short to contain any title
    return false;
}
int Split(int Count[], std::string Text[], int first, int last); // forward declaration

/************************************************************
---------------------QuickSort and Split---------------------
Adapted from Dr. McVey's CS220 Lab #8 (spring 2012), modified
to sort TWO parallel arrays at once: occurrence counts and
their matching word texts, in DESCENDING order of count.
Whenever Split swaps a pair of counts it swaps the matching
texts too, so the arrays stay parallel; the pivot is likewise
tracked in both arrays. QuickSort's O(n log n) average beats
the O(n^2) of bubble/insertion sort, which matters as the
input files grow.
************************************************************/
void QuickSort(int Count[], std::string Text[], int first, int last)
{
    // Ranges of size 0 or 1 are already sorted
    if (first >= last)
        return;

    // Partition around a pivot, then sort each side of the split point
    int split_point = Split(Count, Text, first, last);
    QuickSort(Count, Text, first, split_point - 1);
    QuickSort(Count, Text, split_point + 1, last);
}

int Split(int Count[], std::string Text[], int first, int last)
{
    // The first element is the pivot; remember both halves of the pair
    const int pivot_count = Count[first];
    const std::string pivot_text = Text[first];

    int left = first;
    int right = last;

    while (left < right)
    {
        // Walk right inward past counts smaller than the pivot
        // (smaller counts belong on the right in a descending sort)
        while (pivot_count > Count[right])
            right--;

        // Walk left inward past counts >= the pivot
        while (left < right && Count[left] >= pivot_count)
            left++;

        // Out-of-place pair found: swap the counts AND the matching
        // texts so the two arrays stay parallel
        if (left < right)
        {
            std::swap(Count[left], Count[right]);
            std::swap(Text[left], Text[right]);
        }
    }

    // Drop the pivot pair into its final position
    Count[first] = Count[right];
    Text[first] = Text[right];
    Count[right] = pivot_count;
    Text[right] = pivot_text;

    return right;
}