-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsequential.cpp
More file actions
108 lines (98 loc) · 3.03 KB
/
sequential.cpp
File metadata and controls
108 lines (98 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#include <fstream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <iostream>
#include <vector>
#include <algorithm>
#include <omp.h>
using namespace std;
// Normalizes a token in place so it can be used as a word-count key.
//
// Every leading and trailing byte that is either non-ASCII (negative when
// viewed as a signed char) or punctuation according to ispunct() is removed.
// Bytes in the interior of the token are never removed. Afterwards all
// remaining uppercase letters are converted to lowercase.
//
// w: the token to clean; it is modified in place and may end up empty when it
//    consists solely of strippable bytes.
void process_word(std::string &w) {
    // A byte is strippable when it is non-ASCII or punctuation. The sign test
    // runs first, so ispunct() only ever receives a non-negative value.
    const auto strippable = [](char ch) {
        const signed char c = static_cast<signed char>(ch);
        return c < 0 || std::ispunct(c) != 0;
    };

    // Trim the tail first: dropping trailing bytes is free, and it leaves less
    // data to shift when the head is removed below.
    std::size_t keep = w.size();
    while (keep > 0 && strippable(w[keep - 1])) {
        --keep;
    }
    w.resize(keep);

    // Locate the first byte to keep and erase the whole prefix in one call
    // rather than one byte at a time (which would be quadratic in the prefix
    // length because each erase shifts the rest of the string).
    std::size_t lead = 0;
    while (lead < w.size() && strippable(w[lead])) {
        ++lead;
    }
    w.erase(0, lead);

    // Convert all letters to lowercase. The cast to unsigned char keeps the
    // <cctype> calls well-defined for interior bytes >= 0x80.
    for (char &ch : w) {
        const unsigned char c = static_cast<unsigned char>(ch);
        if (std::isupper(c)) {
            ch = static_cast<char>(std::tolower(c));
        }
    }
}
// Sequential word count organised as map / shuffle / reduce stages.
//
// Reads every file named on the command line, normalizes each
// whitespace-separated token with process_word(), counts the occurrences of
// each distinct word, and writes the counts in alphabetical order to
// "results_seq.txt". Stage timings are printed to stderr.
//
// argc/argv: argv[1..argc-1] are the input file paths.
// Returns 0 on success, 1 on a usage error or when a file cannot be opened.
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <input_files>\n";
        return 1;
    }

    typedef std::pair<std::string, std::size_t> WordCount;

    std::vector<WordCount> raw_tuples;
    std::size_t file_word_count = 0;

    const double start = omp_get_wtime();

    // File reading step: process the input files one after another.
    for (int f = 1; f < argc; ++f) {
        std::ifstream fin(argv[f]);
        if (!fin) {
            std::cerr << "error: unable to open input file: " << argv[f] << '\n';
            return 1;
        }
        std::string word;
        while (fin >> word) {
            process_word(word);
            // Map step: emit (word, 1) for every non-empty normalized token.
            if (!word.empty()) {
                ++file_word_count;
                raw_tuples.emplace_back(word, std::size_t{1});
            }
        }
    }

    // Shuffle step: group the emitted values by word.
    std::unordered_map<std::string, std::vector<std::size_t>> buckets;
    for (const WordCount &tuple : raw_tuples) {
        buckets[tuple.first].push_back(tuple.second);
    }

    // Taken after the shuffle loop, so the "File Read & Map" figure reported
    // below also covers the shuffle.
    const double start_r = omp_get_wtime();

    // Reduce step: sum the values collected for each word. Entries are bound
    // by reference so the key and its value vector are not copied.
    std::vector<WordCount> counts;
    counts.reserve(buckets.size());
    for (const auto &entry : buckets) {
        std::size_t sum = 0;
        for (const std::size_t value : entry.second) {
            sum += value;
        }
        counts.emplace_back(entry.first, sum);
    }

    const double start_p = omp_get_wtime();

    // Sort in alphabetical order. The comparator's parameter type matches the
    // element type exactly so no converted temporary pair (and string copy) is
    // created per comparison.
    std::sort(counts.begin(), counts.end(),
              [](const WordCount &a, const WordCount &b) {
                  return a.first < b.first;
              });

    // Writing step
    std::ofstream fout("results_seq.txt");
    if (!fout) {
        std::cerr << "error: unable to open output file: results_seq.txt\n";
        return 1;
    }
    // NOTE: only the first input file name is reported here, while the total
    // covers the words of every input file that was read.
    fout << "Filename: " << argv[1] << ", total words: " << file_word_count << '\n';
    for (std::size_t i = 0; i < counts.size(); ++i) {
        fout << "[" << i << "] " << counts[i].first << ": " << counts[i].second << '\n';
    }

    const double end = omp_get_wtime();

    // Use cerr to always print in terminal
    std::cerr << "Sequential time: " << (end - start) * 1000 << " ms\n";
    std::cerr << "\tFile Read & Map time: " << (start_r - start) * 1000 << " ms\n";
    std::cerr << "\tReduce time: " << (start_p - start_r) * 1000 << "ms\n";
    std::cerr << "\tSort & print time: " << (end - start_p) * 1000 << "ms\n";
    return 0;
}