/* Include guard and header dependencies (Lucene++, CASManager.h, DataStructures.h). The same line opens the definition of INDEX_ROOT_LOCATION, whose value is the path literal on the following line -- presumably the default root directory of the Lucene indices; confirm against usage. */
9 #ifndef LIBTPC_TPCINDEXREADER_H 10 #define LIBTPC_TPCINDEXREADER_H 14 #include <lucene++/LuceneHeaders.h> 16 #include "CASManager.h" 17 #include "DataStructures.h" 23 static const std::string INDEX_ROOT_LOCATION(
"/usr/local/textpresso/luceneindex/");
// File name "cc.cfg"; by its name, presumably the file holding the per-corpus document counter -- confirm against usage.
24 static const std::string CORPUS_COUNTER_FILENAME(
"cc.cfg");
// Name of the document-level index ("fulltext").
25 static const std::string DOCUMENT_INDEXNAME(
"fulltext");
// Name of the sentence-level index ("sentence").
26 static const std::string SENTENCE_INDEXNAME(
"sentence");
// "_cs" variant of the document index name; presumably the case-sensitive index -- confirm.
27 static const std::string DOCUMENT_INDEXNAME_CS(
"fulltext_cs");
// "_cs" variant of the sentence index name; presumably the case-sensitive index -- confirm.
28 static const std::string SENTENCE_INDEXNAME_CS(
"sentence_cs");
// Integer limit 1000000; by its name, presumably the maximum number of hits handled per search -- confirm against usage.
30 static const int MAX_HITS(1000000);
// Integer threshold 30000; by its name, presumably the hit count from which a field cache is used -- confirm.
31 static const int FIELD_CACHE_MIN_HITS(30000);
// Limit 200; by its name, presumably caps how many sentences are combined into a single query -- confirm.
33 static const int MAX_NUM_SENTENCES_IN_QUERY(200);
// Limit 200; by its name, presumably caps how many document ids are combined into a single query -- confirm.
34 static const int MAX_NUM_DOCIDS_IN_QUERY(200);
// Set holding the four index names defined above: fulltext, sentence, and their "_cs" variants.
36 static const std::set<std::string> INDEX_TYPES{DOCUMENT_INDEXNAME, SENTENCE_INDEXNAME, DOCUMENT_INDEXNAME_CS,
37 SENTENCE_INDEXNAME_CS};
// The string "subindex"; presumably the base name of sub-index directories -- confirm against create_subindex_dir_structure.
38 static const std::string SUBINDEX_NAME =
"subindex";
// Set of ten document field names. It is the default value of the include_doc_fields
// parameter of the document-details functions declared further down in this header.
// NOTE(review): the "_compressed" suffix presumably marks fields stored compressed in the index -- confirm.
39 static const std::set<std::string> DOCUMENTS_FIELDS_DETAILED{
"accession_compressed",
"title_compressed",
40 "author_compressed",
"journal_compressed",
"year",
41 "abstract_compressed",
"filepath",
43 "fulltext_compressed",
"type_compressed",
44 "fulltext_cat_compressed"};
// Set of five sentence field names. It is the default value of the
// include_match_sentences_fields parameter of the document-details functions declared
// further down in this header.
45 static const std::set<std::string> SENTENCE_FIELDS_DETAILED{
"sentence_id",
"begin",
"end",
46 "sentence_compressed",
"sentence_cat_compressed"};
// Two string data members; the enclosing type's header is not visible in this listing.
// NOTE(review): presumably these belong to TmpConf, the type returned by write_tmp_conf_files -- confirm.
57 std::string new_index_flag;
58 std::string index_descriptor;
/**
 * Construct the exception from a C-string message, which is forwarded to the
 * std::runtime_error base.
 *
 * NOTE(review): the constructor is declared throw(), yet the std::runtime_error
 * constructor it calls is not guaranteed non-throwing (it may need to allocate);
 * confirm that the exception specification is intended.
 */
64 explicit tpc_exception(
char const*
const message)
throw(): std::runtime_error(message) { }
65 virtual char const* what()
const throw() {
return std::exception::what(); }
/**
 * Construct an IndexManager for the index located at index_path.
 *
 * The visible member initialisers copy index_path into index_dir and
 * value-initialise externalIndexManager (an empty shared_ptr). The listing
 * jumps from line 83 to line 88, so further initialisers are not visible here.
 *
 * @param index_path path of the index; stored in index_dir
 * @param read_only  defaults to true; its use is not visible in this listing
 * @param external   defaults to false; its use is not visible in this listing
 */
82 explicit IndexManager(
const std::string& index_path,
bool read_only =
true,
bool external =
false):
83 index_dir(index_path),
88 externalIndexManager() { };
93 readers_map = other.readers_map;
94 index_dir = other.index_dir;
95 readonly = other.readonly;
96 external = other.external;
97 corpus_doc_counter = other.corpus_doc_counter;
98 externalIndexManager = other.externalIndexManager;
101 readers_map = other.readers_map;
102 index_dir = other.index_dir;
103 readonly = other.readonly;
104 external = other.external;
105 corpus_doc_counter = other.corpus_doc_counter;
106 externalIndexManager = other.externalIndexManager;
109 readers_map(std::move(other.readers_map)),
110 readonly(other.readonly),
111 external(other.external),
112 index_dir(std::move(other.index_dir)),
113 corpus_doc_counter(std::move(other.corpus_doc_counter)),
114 externalIndexManager(std::move(other.externalIndexManager)) {}
116 readers_map = std::move(other.readers_map);
117 index_dir = std::move(other.index_dir);
118 readonly = other.readonly;
119 external = other.external;
120 corpus_doc_counter = std::move(other.corpus_doc_counter);
121 externalIndexManager = std::move(other.externalIndexManager);
125 for (
auto &it : readers_map) {
/** Static; returns a list of strings -- by its name, the names of the available corpora (implementation not visible here; confirm). */
134 static std::vector<std::string> get_available_corpora();
/** Returns a list of strings -- by its name, corpora beyond the standard ones (implementation not visible here; confirm). */
140 std::vector<std::string> get_additional_corpora();
/**
 * Returns an integer -- by its name, the number of articles in the given corpus.
 *
 * @param corpus   corpus name
 * @param external defaults to false; presumably selects the external index -- confirm
 */
148 int get_num_articles_in_corpus(
const std::string& corpus,
bool external =
false);
171 const std::set<std::string> &doc_ids = {},
190 bool include_sentences_details =
true,
191 std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
192 std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
193 const std::set<std::string> &exclude_doc_fields = {},
194 const std::set<std::string> &exclude_match_sentences_fields = {});
/**
 * Returns a vector of DocumentDetails for the given document summaries.
 * Listing line 215 is absent, so one or more parameters may be missing from this view.
 *
 * @param doc_summaries                  summaries of the documents to look up
 * @param include_sentences_details      defaults to true
 * @param include_doc_fields             document fields to include; defaults to DOCUMENTS_FIELDS_DETAILED
 * @param include_match_sentences_fields sentence fields to include; defaults to SENTENCE_FIELDS_DETAILED
 * @param exclude_doc_fields             document fields to exclude; defaults to the empty set
 * @param exclude_match_sentences_fields sentence fields to exclude; defaults to the empty set
 *
 * NOTE(review): the two include sets are taken by value while the exclude sets
 * are taken by const reference; confirm whether the copies are intended.
 */
214 std::vector<DocumentDetails> get_documents_details(
const std::vector<DocumentSummary> &doc_summaries,
216 bool include_sentences_details =
true,
217 std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
218 std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
219 const std::set<std::string> &exclude_doc_fields = {},
220 const std::set<std::string> &exclude_match_sentences_fields = {});
/**
 * Comparison predicate: true when a's score is greater than b's score.
 * Used as a sort comparator it would order documents by descending score.
 */
223 static bool document_score_gt(
const Document &a,
const Document &b) {
return a.score > b.score; }
225 if (a.year != b.year)
return a.year > b.year;
226 return a.score > b.score;
230 return a.score > b.score;
/**
 * By its name, builds an index from an existing directory of CAS files
 * (implementation not visible here; confirm).
 *
 * @param input_cas_dir               directory containing the CAS files
 * @param file_list                   defaults to the empty set; presumably restricts indexing to these files -- confirm
 * @param max_num_papers_per_subindex defaults to 50000
 */
238 void create_index_from_existing_cas_dir(
const std::string &input_cas_dir,
239 const std::set<std::string>& file_list = {},
240 int max_num_papers_per_subindex = 50000);
/**
 * By its name, adds a single file to the index (implementation not visible here).
 *
 * @param file_path                   path of the file to add
 * @param max_num_papers_per_subindex defaults to 50000
 */
248 void add_file_to_index(
const std::string& file_path,
int max_num_papers_per_subindex = 50000);
/**
 * By its name, removes a file from the index (implementation not visible here).
 *
 * @param identifier identifier of the entry to remove; its exact format is not visible here
 */
255 void remove_file_from_index(
const std::string& identifier);
/** By its name, computes the corpus counter and persists it (implementation not visible here; confirm). */
260 void calculate_and_save_corpus_counter();
/** By its name, stores the document ids of all sentences in a database (implementation not visible here; confirm). */
265 void save_all_doc_ids_for_sentences_to_db();
/** By its name, stores the year of every document in a database (implementation not visible here; confirm). */
270 void save_all_years_for_documents_to_db();
/** Returns a bool -- by its name, whether an external index is attached (presumably via externalIndexManager; confirm). */
276 bool has_external_index();
/**
 * By its name, attaches an external index.
 *
 * @param external_idx_path path of the external index (taken by value)
 */
282 void set_external_index(std::string external_idx_path);
/** By its name, detaches the external index (implementation not visible here; confirm). */
287 void remove_external_index();
/** Returns a list of strings -- by its name, the corpora of the external index (confirm). */
293 std::vector<std::string> get_external_corpora();
/**
 * Returns a collection of Lucene index readers for the given query type.
 *
 * @param type           query type; QueryType is declared elsewhere
 * @param case_sensitive defaults to false; presumably selects the "_cs" indices -- confirm
 */
303 Lucene::Collection<Lucene::IndexReaderPtr> get_subreaders(QueryType type,
bool case_sensitive =
false);
/**
 * Builds SearchResults from a collection of Lucene score docs -- by its name,
 * at document level (implementation not visible here).
 *
 * @param matches_collection Lucene matches to read
 * @param sort_by_year       defaults to false; presumably orders results by year before score -- confirm
 */
313 SearchResults read_documents_summaries(
const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
314 bool sort_by_year =
false);
/**
 * Builds SearchResults from a collection of Lucene score docs -- by its name,
 * at sentence level (implementation not visible here).
 *
 * @param matches_collection Lucene matches to read
 * @param sort_by_year       defaults to false; presumably orders results by year before score -- confirm
 */
324 SearchResults read_sentences_summaries(
const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
325 bool sort_by_year =
false);
/**
 * By its name, fills in sentence details for one document (implementation not
 * visible here). Listing line 340 is absent, so a parameter may be missing
 * from this view.
 *
 * @param doc_summary             summary of the document whose sentences are processed
 * @param sent_parser             Lucene query parser
 * @param searcher                Lucene searcher
 * @param fsel                    Lucene field selector
 * @param fields                  set of Lucene field names
 * @param use_lucene_internal_ids presumably switches lookup to Lucene internal ids -- confirm
 * @param sent_reader             Lucene multi-reader, presumably over the sentence indices -- confirm
 */
339 void update_sentences_details_for_document(
const DocumentSummary &doc_summary,
341 Lucene::QueryParserPtr sent_parser,
342 Lucene::SearcherPtr searcher,
343 Lucene::FieldSelectorPtr fsel,
344 const std::set<Lucene::String> &fields,
345 bool use_lucene_internal_ids,
346 Lucene::MultiReaderPtr sent_reader);
/**
 * Static; returns a set of Lucene::String field names derived from the three
 * std::string sets. How the sets are combined is not visible here.
 * NOTE(review): presumably (include minus exclude) plus required -- confirm against the implementation.
 *
 * @param include_fields  fields to include
 * @param exclude_fields  fields to exclude
 * @param required_fields defaults to the empty set
 */
348 static std::set<Lucene::String> compose_field_set(
const std::set<std::string> &include_fields,
349 const std::set<std::string> &exclude_fields,
350 const std::set<std::string> &required_fields = {});
/**
 * By its name, updates doc_details for one field (implementation not visible here).
 *
 * @param doc_details details object, passed by non-const reference
 * @param field       name of the field
 * @param doc_ptr     Lucene document, presumably the source of the field value -- confirm
 */
352 void update_document_details(
DocumentDetails &doc_details, Lucene::String field,
353 Lucene::DocumentPtr doc_ptr);
/**
 * Returns a vector of DocumentDetails for the given summaries, using the
 * supplied Lucene objects (implementation not visible here).
 *
 * @param doc_summaries           summaries of the documents to read
 * @param doc_parser              Lucene query parser
 * @param searcher                Lucene searcher
 * @param fsel                    Lucene field selector
 * @param fields                  set of Lucene field names
 * @param use_lucene_internal_ids presumably switches lookup to Lucene internal ids -- confirm
 * @param doc_reader              Lucene multi-reader, presumably over the document indices -- confirm
 */
355 std::vector<DocumentDetails> read_documents_details(
const std::vector<DocumentSummary> &doc_summaries,
356 Lucene::QueryParserPtr doc_parser,
357 Lucene::SearcherPtr searcher,
358 Lucene::FieldSelectorPtr fsel,
359 const std::set<Lucene::String> &fields,
360 bool use_lucene_internal_ids,
361 Lucene::MultiReaderPtr doc_reader);
/**
 * Static; returns a TmpConf for the given index path -- by its name, after
 * writing temporary configuration files (implementation not visible here; confirm).
 *
 * @param index_path path of the index
 */
368 static TmpConf write_tmp_conf_files(
const std::string &index_path);
/**
 * Static; by its name, creates the sub-index directory layout under
 * index_path (implementation not visible here; confirm).
 *
 * @param index_path path of the index
 */
374 static void create_subindex_dir_structure(
const std::string &index_path);
384 int add_cas_file_to_index(
const char *file_path, std::string index_descriptor, std::string tempDir,
/**
 * By its name, processes one file; the meaning of the bool result is not
 * visible here.
 *
 * @param filepath    path of the file
 * @param first_paper passed by non-const reference, so the callee may modify it
 * @param tmp_conf    temporary configuration (TmpConf)
 * @param update_db   defaults to false
 */
395 bool process_single_file(
const std::string &filepath,
bool &first_paper,
const TmpConf &tmp_conf,
396 bool update_db =
false);
/**
 * By its name, removes a document from the index; returns a string whose
 * meaning is not visible here.
 * NOTE(review): presumably the returned string is the removed document's id -- confirm.
 *
 * @param identifier     identifier of the document (taken by value)
 * @param case_sensitive presumably selects the "_cs" index -- confirm
 */
398 std::string remove_document_from_index(std::string identifier,
bool case_sensitive);
/**
 * By its name, removes the sentences belonging to a document
 * (implementation not visible here).
 *
 * @param doc_id         id of the document
 * @param case_sensitive presumably selects the "_cs" index -- confirm
 */
399 void remove_sentences_for_document(
const std::string& doc_id,
bool case_sensitive);
/** By its name, adds a document and its sentences to a bdb store (presumably Berkeley DB; confirm). */
401 void add_doc_and_sentences_to_bdb(std::string identifier);
/** By its name, persists the corpus counter (presumably corpus_doc_counter; confirm). */
403 void save_corpus_counter();
/** By its name, refreshes the corpus counter (implementation not visible here; confirm). */
405 void update_corpus_counter();
/** By its name, loads the corpus counter (implementation not visible here; confirm). */
410 void load_corpus_counter();
/**
 * Returns an integer -- by its name, the number of documents of the given
 * corpus as counted from the index.
 *
 * @param corpus corpus name
 */
411 int get_num_docs_in_corpus_from_index(
const std::string& corpus);
// Data members. Listing lines 415-416 are absent from this view; the copy/move
// code earlier in the listing also assigns members named readonly and external.
// Map from a string key to a Lucene index reader; the key format is not visible here.
413 std::map<std::string, Lucene::IndexReaderPtr> readers_map;
// Path of the index; initialised from the constructor's index_path argument.
414 std::string index_dir;
// Map from string to int; by its name, presumably corpus name -> document count -- confirm.
417 std::map<std::string, int> corpus_doc_counter;
// Shared pointer to another IndexManager; value-initialised (empty) by the constructor.
418 std::shared_ptr<IndexManager> externalIndexManager;
424 #endif //LIBTPC_TPCINDEXREADER_H results generated by a search
Definition: DataStructures.h:167
data structure that contains summary information related to a document as the result of a search ...
Definition: DataStructures.h:91
data structure that contains summary information related to a sentence as the result of a search ...
Definition: DataStructures.h:44
Definition: IndexManager.h:62
data structure that contains detailed information related to a document as the result of a search ...
Definition: DataStructures.h:109
data structure that represents information about temporary configuration files of an index ...
Definition: IndexManager.h:56
Definition: CASManager.h:16
generic information of a document
Definition: DataStructures.h:77
IndexManager(const std::string &index_path, bool read_only=true, bool external=false)
Definition: IndexManager.h:82
Definition: IndexManager.h:71
Definition: DataStructures.h:126