/* ILLUSTRATING TEXT BY TAG CLOUDS - Parsing the Data File - preprocessor.cpp ---------------------------------------------------------------------------------------------------------------- Author: Emma E. Alberts Instructor: Bonita McVey & Dave Pankratz Date: 5/6/2019 Purpose: create a program that reads a data file and constructs a tag cloud ----------------------------------------------------------------------------------------------------------------*/ #include #include #include #include #include "ctype.h" #include "tagNode.h" using namespace std; int getIndex(char); int main() { ifstream fin; string resp; cout << "Enter file name: " << endl; cin >> resp; fin.open(resp); ofstream output; output.open("output.txt"); // make sure file is opened if (!fin.is_open()) { cout << "Unable to open data file ... bye" << endl; return 0; } // go through character by character char ch; bool lastSpace = false; fin >> noskipws; cout << "Reading data file and first step" << endl; while (!fin.eof()) { fin >> ch; //cout << ch << endl; if (ch == ' ') { if (!lastSpace) { output << '\n'; lastSpace = true; } } else if (isalnum(ch) != 0 || ch == '\'') { // is alphanumeric or is an apostrophe if (isdigit(ch) == 0 && ch != '\'') { // if it is not a digit char l = tolower(ch); output << l; lastSpace = false; } } } fin.close(); output.close(); // open output file again // remove any common words based on file ifstream common_words; common_words.open("common_words.txt"); ifstream reopen_output; reopen_output.open("output.txt"); ofstream final_output; final_output.open("final_output.txt"); string word, com_word; bool isCommon = false; cout << "Removing common words" << endl; while (!reopen_output.eof()) { isCommon = false; getline(reopen_output, word); while (!common_words.eof()) { getline(common_words, com_word); if (word == com_word) isCommon = true; } if (isCommon == false) final_output << word << endl; // reset common_words back to the beginning of the file common_words.clear(); common_words.seekg(0, ios::beg); } final_output.close(); common_words.close(); // sort the file alphabetically to make the counting easier // use an array of linked lists - 0 = a, 1 = b, 2 = c, ..., 26 = ' // all words starting with a go in 0, then do a sort on each linked list ifstream list_to_sort; list_to_sort.open("final_output.txt"); node * alphabetical_sort[27]; string curWord; char firstLetter; int index; // set all pointers to 0 initially for (int i = 0; i < 27; i++) alphabetical_sort[i] = NULL; cout << "Sorting alphabetically" << endl; while (!list_to_sort.eof()) { getline(list_to_sort, curWord); firstLetter = curWord[0]; index = getIndex(firstLetter); if (index != -1) { addWord(curWord, alphabetical_sort, index); } } list_to_sort.close(); cout << "Sorting each index alphabetically" << endl; // sort each index of the array and the remove duplicates while counting frequency node * p; for (int i = 0; i < 27; i++) { p = alphabetical_sort[i]; MergeSort(p); removeDuplicates(p); } srand(time(0)); bool addIt; ofstream finished_file; finished_file.open("final_result.txt"); while (!isEmpty(alphabetical_sort)) { index = (rand() % 27); cout << "index " << index << endl; addIt = addNodeToFile(alphabetical_sort[index]); if (addIt) { finished_file << alphabetical_sort[index]->word << " " << alphabetical_sort[index]->frequency << endl; deleteNode(alphabetical_sort[index]); } } finished_file.close(); cout << "DONE" << endl; system("pause"); return 0; } /* getIndex ---------------------------------------------------------------------------------------------------------------- Purpose: return the index in the array based upon the first letter Parameters: ch - the first char of the string Output: int - the index of the array in which it should be added ----------------------------------------------------------------------------------------------------------------*/ int getIndex(char ch) { switch (ch) { case 'a': return 0; break; case 'b': return 1; break; case 'c': return 2; break; case 'd': return 3; break; case 'e': return 4; break; case 'f': return 5; break; case 'g': return 6; break; case 'h': return 7; break; case 'i': return 8; break; case 'j': return 9; break; case 'k': return 10; break; case 'l': return 11; break; case 'm': return 12; break; case 'n': return 13; break; case 'o': return 14; break; case 'p': return 15; break; case 'q': return 16; break; case 'r': return 17; break; case 's': return 18; break; case 't': return 19; break; case 'u': return 20; break; case 'v': return 21; break; case 'w': return 22; break; case 'x': return 23; break; case 'y': return 24; break; case 'z': return 25; break; case '\'': return 26; break; } return -1; }