-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPinyinTest.cpp
More file actions
115 lines (96 loc) · 4.21 KB
/
PinyinTest.cpp
File metadata and controls
115 lines (96 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/*
本文件只是简单的拼音测试用例,不是核心代码库之一
你如果想要使用搜索功能,你应该用TreeSearcher.h获取拼音匹配支持
如果你只需要拼音获取,那么本项目的PinIn也是非常合适的
项目统一用标准的std::string和std::string_view处理字符串,包括utf8字符串,所以你要保证你输入的应该是utf8字符串
所以这是一个天然支持utf8且只支持utf8的库,原始的Java PinIn库有个不可忽视的缺陷,就是不支持utf16不可表达的字
*/
#include <iostream>
#include <fstream>
#include <chrono>
#include <array>
#include "PinIn4Cpp/TreeSearcher.h"
using high_time_point = std::chrono::high_resolution_clock::time_point;
static high_time_point GetTimePoint() {
return std::chrono::high_resolution_clock::now();
}
static double GetTimeMS(high_time_point start, high_time_point end) {//微秒
std::chrono::duration<double> result = end - start;
return result.count() * 1000;
}
static void Pause() {
#ifdef _WIN32
system("pause");
#else
std::cout << "Press Enter to continue";
std::cin.get();
#endif
}
constexpr int TreeLoopInsertCount = 1;
constexpr int SearcherLoopCount = 1;
int main() {
#ifdef _WIN32
system("chcp 65001");//编码切换,windows平台的cmd命令
#endif
std::fstream file("../../../../test_data/small.txt");//数据读取
std::string line;
std::vector<std::string> FileCache;
while (std::getline(file, line)) {
FileCache.push_back(line);
}
file.close();
//PinIn的构造函数参数为拼音数据的文件路径,请使用https://github.com/mozillazg/pinyin-data中提供的,当然本项目也放有pinyin.txt,你可以直接使用
//说起来这个数据源是原本的约三倍大小(
//不过格式方面不一样,所以不方便比较
//TreeSearcher的第二个参数除了智能指针,其实都是PinIn类的构造参数
Pause();
high_time_point now = GetTimePoint();
std::shared_ptr<PinInCpp::PinIn> pininptr = std::make_shared<PinInCpp::PinIn>("../../../../test_data/pinyin.txt");
high_time_point end = GetTimePoint();
std::cout << GetTimeMS(now, end) << "ms\n";//计算获取耗时并打印,单位毫秒
Pause();
//pininptr->SetCharCache(true); //默认开启
PinInCpp::PinIn::Config cfg = pininptr->config();
cfg.fZh2Z = true;
cfg.fSh2S = true;
cfg.fCh2C = true;
cfg.fAng2An = true;
cfg.fIng2In = true;
cfg.fEng2En = true;
cfg.fU2V = true;
cfg.fFirstChar = true;//新增的首字母模糊
cfg.commit();
double timeCount = 0;
std::unique_ptr<PinInCpp::TreeSearcher> tree;
for (int i = 0; i < TreeLoopInsertCount; i++) {
tree = std::make_unique<PinInCpp::TreeSearcher>(PinInCpp::Logic::CONTAIN, pininptr);
for (const auto& v : FileCache) {
high_time_point now = GetTimePoint();
tree->put(v);
high_time_point end = GetTimePoint();
timeCount += GetTimeMS(now, end);
}
}
std::cout << timeCount / (double)TreeLoopInsertCount << "ms\n";
//pininptr->SetCharCache(false);//手动关闭可以释放缓存,不过搜索时也可能会利用缓存,会导致一定程度的性能下降
//插入耗时,比Java的快了,目前提供了缓存支持,主要原因还是在utf8字符串处理之类的问题上,当然内存占用也是如此(更大),毕竟utf8比utf16浪费内存
//目前已利用FourCC技术,将单UTF8字符高效的打包成uint32_t,利用缓冲区技术快速解包回去,实现单字符key的高效存储,避免字符串哈希
//而且为了保证内存池的utf8字符串O1的随机访问,把每个utf8字符都定长为4了,必不可少但是也耗费更多内存了
//内存占用的问题部分还来源size_t类型,因为64位下是八字节大的,基本上是翻倍了
while (true) {//死循环,你可以随便搜索测试集的内容用于测试
std::cout << "input:";
std::getline(std::cin, line);
double timeCount = 0;
std::vector<std::string> vec;
for (int i = 0; i < SearcherLoopCount; i++) {
high_time_point now = GetTimePoint();
vec = tree->ExecuteSearch(line);
high_time_point end = GetTimePoint();
timeCount += GetTimeMS(now, end);
}
for (const auto& v : vec) {
std::cout << v << '\n';
}
std::cout << timeCount / (double)SearcherLoopCount << "ms\n";//计算获取耗时并打印,单位毫秒
}
}