/*
 * cs240/labs/lab10/word_count.c
 *
 * Repository viewer metadata: 280 lines, 6.6 KiB, C.
 * Timestamp shown by the viewer: 2018-10-15 17:20:57 -04:00
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
//________________________SETTINGS__________________________
//Size in bytes of the shared word buffer that nextWordInFile fills
#define WORDLEN 100
//Factor used by plotHistogram when auto scaling: scale = (1 / highest count) * AUTO_HIST_SCALE
#define AUTO_HIST_SCALE 120
//Initial WordList capacity (in entries); wordCount doubles this value whenever the list is full
int MAX_LIST_LEN = 25;
//Total distinct words encountered (number of WordList slots in use)
int nWords = 0;
//Symbol for discrete unit of histogram; main passes it to plotHistogram
char symbolHistogram = '|';
//Scale applied to Histogram
//customScale > 0 acts as multiplier on each word's count
//customScale <= 0 invokes Auto Scaling based on most frequent word
double customScale = 0;
//______________________________________________________________________
//Input file pointer; opened in main and read by nextWordInFile
FILE * fp;
//________________________________STRUCT______________________________________
//TODO: Define structure called "WordCount"
//Also, define a pointer called "WordList" to this structure here (Array of structures)
//Then, malloc this defined structure in main
//One entry of the word table: a word together with its occurrence count
typedef struct WordCount {
char* word; //heap-allocated string (obtained via strdup)
int count; //number of occurrences counted so far
} WordCount;
//Array of pointers to individually allocated WordCount entries; slots [0, nWords) are in use
WordCount** WordList;
//______________________________________________________________________
//Scratch buffer in which nextWordInFile assembles the word currently being read
char buffer[WORDLEN];
//Number of characters currently stored in buffer
int pos = 0;
//_________NEXTWORD: Given to students: Returns nextword in file_______
static char * nextWordInFile() {
char c;
while((c=fgetc(fp)) != EOF) {
if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
buffer[pos++] = c;
buffer[pos] = '\0';
}
else {
if(pos > 0) {
pos = 0;
return strdup(buffer);
}
}
}
return NULL;
}
//________________________Helper Functions______________________________
/*
 * Converts every ASCII upper-case letter of <word> to lower case, in place.
 * All other characters are left untouched. Returns <word> itself.
 */
char * toLower(char * word) {
    for (char * cursor = word; *cursor != '\0'; ++cursor) {
        char ch = *cursor;
        if (ch < 'A' || ch > 'Z') {
            continue;
        }
        // Upper- and lower-case ASCII letters are a fixed distance apart.
        *cursor = ch + ('a' - 'A');
    }
    return word;
}
/*
 * Looks up <word> in WordList (exact, case-sensitive strcmp match).
 * Returns the index of the first matching entry among the nWords entries in
 * use, or -1 when the word is not present, so it doubles as an existence check.
 */
int getWordIndex(char * word) {
    int idx = 0;
    while (idx < nWords) {
        if (strcmp(WordList[idx]->word, word) == 0) {
            return idx;
        }
        idx++;
    }
    return -1;
}
/* Prints the one-line command syntax summary to standard output. */
void help() {
    fputs("SYNTAX: ./WordCount <filename> [ --sort-lex | --sort-num ] --histogram?\n", stdout);
}
void printStruct() {
//Helper to print your structure
//Traverse and print members with their count
/*
word1 word_count
.
.
.
wordN word_count
*/
for (int i = 0; i < nWords; i++) {
printf("%s %d\n", WordList[i]->word, WordList[i]->count);
}
}
//_________________________________________________________________________
//_________________________Task 1: WordCount_______________________________
/*
*Update <WordList> with every new word encountered.
*For a repeated instance of a word, only increment its <count>.
*New entries: realloc when the list is full (capacity is tracked in the global variable MAX_LIST_LEN).
*/
void wordCount() {
//TODO: Your Implementation here
char* word = nextWordInFile();
while (word != NULL) {
toLower(word);
int i = getWordIndex(word);
if (i != -1)
WordList[i]->count++;
else {
if (nWords >= MAX_LIST_LEN) {
MAX_LIST_LEN *= 2;
WordList = (WordCount **) realloc(WordList, sizeof(WordCount) * MAX_LIST_LEN);
}
WordCount* newCount = malloc(sizeof(WordCount));
newCount->word = strdup(word);
free(word);
word = NULL;
newCount->count = 1;
WordList[nWords] = newCount;
nWords++;
}
word = nextWordInFile();
}
}
//________________________________________________________________
//_________________________Task 2: Flags (Optional)_______________________________
//Task 2 a) : Sorting Alphabetically (Ascending)
void sortLex() {
//TODO: Your Implementation here
int sortFlag;
for (int i = 1; i < nWords; i++) {
sortFlag = 0;
for (int j = 0; j < nWords - 1; j++) {
if (strcmp(WordList[j]->word, WordList[j + 1]->word) > 0) {
int tempCount = WordList[j]->count;
char* tempStr = WordList[j]->word;
WordList[j]->count = WordList[j + 1]->count;
WordList[j]->word = WordList[j + 1]->word;
WordList[j + 1]->count = tempCount;
WordList[j + 1]->word = tempStr;
sortFlag = 1;
}
}
if (sortFlag == 0)
return;
}
}
//Task 2 b) : Sorting by frequencies (Descending)
void sortNum() {
//TODO: Your Implementation here
int sortFlag;
for (int i = 1; i < nWords; i++) {
sortFlag = 0;
for (int j = 0; j < nWords - 1; j++) {
if (WordList[j]->count < WordList[j + 1]->count) {
int tempCount = WordList[j]->count;
char* tempStr = WordList[j]->word;
WordList[j]->count = WordList[j + 1]->count;
WordList[j]->word = WordList[j + 1]->word;
WordList[j + 1]->count = tempCount;
WordList[j + 1]->word = tempStr;
sortFlag = 1;
}
}
if (sortFlag == 0)
return;
}
}
//________________________________________________________________
//___________________Extra Credit: Histogram______________________
void plotHistogram(double scale, char symbol) {
int maxLen = 20;
int max = 0;
for (int i = 0; i < nWords; i++) {
if (WordList[i]->count > max) {
max = WordList[i]->count;
}
//if (strlen(WordList[i]->word) > maxLen) {
// maxLen = strlen(WordList[i]->word);
//}
}
if (scale <= 0) {
scale = (1 / ((double) max)) * AUTO_HIST_SCALE;
printf("HISTOGRAM WITH AUTO SCALING OF (CURR / MAX) * AUTO_HIST_SCALE: %d\n", AUTO_HIST_SCALE);
printf("USING HIGHEST FREQUENCY AS: %d\n", max);
}
for (int i = 0; i < nWords; i++) {
printf("%s", WordList[i]->word);
int len = strlen(WordList[i]->word);
int size = ceil((int) (((double) WordList[i]->count) * scale)) + 1; //(WordList[i]->count / max)
for (int k = 0; k < maxLen - len; k++) {
printf(" ");
}
for (int j = 0; j < size; j++) {
printf("|");
}
printf("\n");
}
}
//_____________________________________________________________________
int main(int argc, char **argv) {
//WordCount array
//Malloc your structure array here
// Defined above
WordList = (WordCount **) malloc(sizeof(WordCount) * MAX_LIST_LEN);
//Reads file with <filename> given by ARGV[1]
//nextWordInFile returns words from file pointed by FILE * fp
if (argc == 2 && strcmp(argv[1], "--help") == 0)
help();
else if(argc >= 2) {
fp = fopen(argv[1], "r");
if(!fp) return 0;
//Call your WordCount method and sorting methods HERE as needed by your flags
wordCount();
if (argc >= 3 && strcmp(argv[2], "--sort-num") == 0)
sortNum();
else if (argc >= 3 && strcmp(argv[2], "--sort-lex") == 0)
sortLex();
if ((argc >= 3 && strcmp(argv[2], "--histogram") == 0) || (argc >= 4 && strcmp(argv[3], "--histogram") == 0))
plotHistogram(customScale, symbolHistogram);
else
printStruct();
//TIP: Remember to change argc condition with additional flags. Current implementation only works for 2 arguments
}
else {
help();
}
//TODO: Use argc and argv to check for flags
return 0;
}