-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapreduce.cpp
More file actions
121 lines (116 loc) · 3.63 KB
/
Copy pathmapreduce.cpp
File metadata and controls
121 lines (116 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <cstdio>
#include <vector>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <unordered_map>
const int MAXBLOCK = 2e3; // max blocks in
const int BASE = 97; // used as hash value
const int MAXOCCRC = 100; // the number of url we want to get at last
struct Data
{
std::string url_name; // name of the url
int url_count; // occurrences of the url
Data(){}
Data(std::string name, int count):url_name(name), url_count(count){}
bool operator < (const Data &rhs)const
{
if(url_count != rhs.url_count)return url_count > rhs.url_count;
else return url_name < rhs.url_name;
}
};
void Map(std::ifstream &ifs, std::string output_dir)
{
// @Map: divide different urls into different files
// @param:
// @ifs: stringstream of the input file
// @output_dir: output directory
// @ret: void
std::ofstream *ofs = new std::ofstream[MAXBLOCK];
for(int i = 0; i < MAXBLOCK; i++){
std::string output_file = output_dir+std::to_string(i)+".txt";
ofs[i].open(output_file.c_str(), std::ios::out|std::ios::trunc);
if(!ofs[i].is_open()){
printf("Cannot open file %s\n", output_file.c_str());
exit(-1);
}
}
std::string s;
while(ifs >> s){
int hash = 0;
for(auto c: s){
hash = (hash*BASE+c)%MAXBLOCK;
}
ofs[hash] << s << std::endl;
}
for(int i = 0; i < MAXBLOCK; i++){
ofs[i].close();
}
delete[] ofs;
}
void Reduce(std::string output_dir)
{
// Reduce: sort all the urls in different files, and select the top 100 urls from each file to get the final result
// @param
// @output_dir: output directory
// @ret: void
std::string map_dir = output_dir+"/tmp/";
std::vector<Data> url_tot;
for(int i = 0; i < MAXBLOCK; i++){
std::string filename = map_dir+std::to_string(i)+".txt"; //filename of each blocks
std::ifstream ifs(filename, std::ios::in);
std::string s;
std::unordered_map<std::string, int> url_num; //count the ocurrencies of each urls
while(ifs >> s){
url_num[s]++;
}
std::vector<Data> url_file;
for(auto p: url_num){
url_file.push_back(Data(p.first, p.second));
}
sort(url_file.begin(), url_file.end()); // get the top 100 urls that appear most in this block
int num = 0;
for(auto p: url_file){
if(num >= MAXOCCRC)break;
else num++;
url_tot.push_back(p);
}
ifs.close();
}
sort(url_tot.begin(), url_tot.end()); // get the top 100 urls that appear most in all blocks
std::string output_file = output_dir+"/result.txt";
std::ofstream ofs(output_file.c_str(), std::ios::out|std::ios::trunc);
if(!ofs.is_open()){
printf("Cannot open file %s\n", output_file.c_str());
exit(-1);
}
int num = 0;
for(auto data: url_tot){
if(num >= MAXOCCRC)break;
else num++;
ofs << data.url_name << " " << data.url_count << std::endl;
}
ofs.close();
}
int main(int argc, char* argv[])
{
if(argc != 3){
printf("Usage: %s input_file output_directory\n", argv[0]);
exit(-1);
}
std::ifstream ifs(argv[1], std::ios::in);
if(!ifs.is_open()){
printf("Cannot open file %s\n", argv[1]);
exit(-1);
}
std::string map_dir = std::string(argv[2])+"/tmp/";
std::string cmd = "mkdir -p "+map_dir;
if(system(cmd.c_str())){
printf("Cannot create directory\n");
exit(-1);
}
Map(ifs, map_dir);
ifs.close();
Reduce(std::string(argv[2]));
return 0;
}