-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathSearchEngine.h
177 lines (164 loc) · 4.82 KB
/
SearchEngine.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#ifndef SEARCH_ENGINE_H
#define SEARCH_ENGINE_H
#include <string>
#include <fstream>
#include <map>
#include <boost/multi_index/composite_key.hpp>
#include <boost/multi_index/identity.hpp>
#include <boost/multi_index/member.hpp>
#include <boost/multi_index/ordered_index.hpp>
#include <boost/multi_index_container.hpp>
#include <boost/serialization/string.hpp>
#include <boost/python.hpp>
#include <boost/python/dict.hpp>
#include "cppjieba/Jieba.hpp"
using namespace std;
using namespace boost;
using namespace boost::multi_index;
using namespace boost::serialization;
using namespace boost::python;
using namespace cppjieba;
class SearchEngine {
public:
SearchEngine(string dict_root, string file = "");
SearchEngine(const SearchEngine& sg);
SearchEngine& operator=(const SearchEngine& sg);
virtual ~SearchEngine() = default;
dict query(string question, unsigned int count = 10);
bool loadFromTxt(string file);
bool load(string file);
bool save(string file);
friend class boost::serialization::access;
template<class Archive>
void serialize(Archive & ar, const unsigned int version){
ar & qa_list_;
ar & term_list_;
ar & term_frequency_list_;
ar & doc_frequency_list_;
ar & next_query_index_;
ar & next_term_index_;
ar & average_question_length_;
}
public:
struct QA {
unsigned int query_index_;
string query_;
string answer_;
};
struct Term {
unsigned int term_index_;
string term_;
};
struct TermFrequency {
unsigned int query_index_;
unsigned int term_index_;
unsigned int term_frequency_;
};
struct DocFrequency {
unsigned int term_index_;
unsigned int doc_frequency_;
};
struct IndexByQueryId {};
struct IndexByQuery {};
struct IndexByTermId {};
struct IndexByTerm {};
struct IndexByQueryIdAndTermId {};
using QAList = multi_index_container<
QA,
indexed_by<
ordered_unique<
boost::multi_index::tag<IndexByQueryId>,
member<QA, unsigned int, &QA::query_index_>
>, // index by query index
ordered_non_unique<
boost::multi_index::tag<IndexByQuery>,
member<QA, string, &QA::query_>
> // index by query string
>
>;
using TermList = multi_index_container<
Term,
indexed_by<
ordered_unique<
boost::multi_index::tag<IndexByTermId>,
member<Term, unsigned int, &Term::term_index_>
>, // index by term index
ordered_unique<
boost::multi_index::tag<IndexByTerm>,
member<Term, string, &Term::term_>
> // index by term string
>
>;
using TermFrequencyList = multi_index_container<
TermFrequency,
indexed_by<
ordered_non_unique<
boost::multi_index::tag<IndexByQueryId>,
member<TermFrequency, unsigned int, &TermFrequency::query_index_>
>, // index by query index
ordered_unique<
boost::multi_index::tag<IndexByQueryIdAndTermId>,
composite_key<
TermFrequency,
member<TermFrequency, unsigned int, &TermFrequency::query_index_>,
member<TermFrequency, unsigned int, &TermFrequency::term_index_>
>
> // index by query index and term index together
>
>;
using DocFrequencyList = multi_index_container<
DocFrequency,
indexed_by<
ordered_unique<
boost::multi_index::tag<IndexByTermId>,
member<DocFrequency, unsigned int, &DocFrequency::term_index_>
> // index by term index
>
>;
private:
float bm25(string question, unsigned int query_index);
Jieba tokenizer_;
QAList qa_list_;
TermList term_list_;
TermFrequencyList term_frequency_list_;
DocFrequencyList doc_frequency_list_;
unsigned int next_query_index_;
unsigned int next_term_index_;
unsigned int average_question_length_;
};
namespace boost {
namespace serialization {
template<class Archive>
void serialize(Archive & ar, SearchEngine::QA & qa, const unsigned int version) {
ar & qa.query_index_;
ar & qa.query_;
ar & qa.answer_;
}
template<class Archive>
void serialize(Archive & ar, SearchEngine::Term & term, const unsigned int version) {
ar & term.term_index_;
ar & term.term_;
}
template<class Archive>
void serialize(Archive & ar, SearchEngine::TermFrequency & term_frequency, const unsigned int version) {
ar & term_frequency.query_index_;
ar & term_frequency.term_index_;
ar & term_frequency.term_frequency_;
}
template<class Archive>
void serialize(Archive & ar, SearchEngine::DocFrequency & doc_frequency, const unsigned int version) {
ar & doc_frequency.term_index_;
ar & doc_frequency.doc_frequency_;
}
}
}
// Python extension module `search_engine`, exposing SearchEngine with its
// (dict_root[, file]) constructor and the query / loadFromTxt / load / save
// methods.
//
// All Boost.Python names are qualified: with several using-directives active
// in this header, a bare `optional` can become ambiguous (boost::python,
// boost, and std in C++17 each provide one).
//
// NOTE(review): BOOST_PYTHON_MODULE defines the module's init function, so
// including this header from more than one translation unit is likely to
// produce duplicate definitions at link time -- consider moving this block
// into a single .cpp file.
BOOST_PYTHON_MODULE(search_engine)
{
    namespace bp = boost::python;

    bp::class_<SearchEngine>(
            "SearchEngine",
            bp::init<std::string, bp::optional<std::string> >())
        // Keyword arguments carry the C++ default (count = 10) over to
        // Python, so both query(q) and query(q, n) are accepted.
        .def("query", &SearchEngine::query,
             (bp::arg("question"), bp::arg("count") = 10))
        .def("loadFromTxt", &SearchEngine::loadFromTxt)
        .def("load", &SearchEngine::load)
        .def("save", &SearchEngine::save)
    ;
}
#endif