libtpc  0.1
Textpressocentral core library
IndexManager.h
1 
9 #ifndef LIBTPC_TPCINDEXREADER_H
10 #define LIBTPC_TPCINDEXREADER_H
11 
12 #include <vector>
13 #include <string>
14 #include <lucene++/LuceneHeaders.h>
15 #include <cfloat>
16 #include "CASManager.h"
17 #include "DataStructures.h"
18 
19 namespace tpc {
20 
21  namespace index {
22 
// Location and naming of the index on disk.
// NOTE(review): how these names are combined into paths is decided by the
// implementation file, which is not visible from this header.
static const std::string INDEX_ROOT_LOCATION = "/usr/local/textpresso/luceneindex/";
static const std::string CORPUS_COUNTER_FILENAME = "cc.cfg";

// Names of the four index types: document-level and sentence-level, each with
// a "_cs" variant (presumably "case sensitive", cf. the `case_sensitive`
// parameters of IndexManager — confirm).
static const std::string DOCUMENT_INDEXNAME = "fulltext";
static const std::string SENTENCE_INDEXNAME = "sentence";
static const std::string DOCUMENT_INDEXNAME_CS = "fulltext_cs";
static const std::string SENTENCE_INDEXNAME_CS = "sentence_cs";

// Numeric limits used by the search code.
// NOTE(review): exact semantics are not visible here; names suggest a cap on
// returned hits and a hit-count threshold for using the Lucene field cache.
static const int MAX_HITS = 1000000;
static const int FIELD_CACHE_MIN_HITS = 30000;

// NOTE(review): presumably the maximum number of sentence ids / document ids
// packed into a single generated query — confirm against the implementation.
static const int MAX_NUM_SENTENCES_IN_QUERY = 200;
static const int MAX_NUM_DOCIDS_IN_QUERY = 200;

// The set of all index type names declared above.
static const std::set<std::string> INDEX_TYPES = {
        DOCUMENT_INDEXNAME,
        SENTENCE_INDEXNAME,
        DOCUMENT_INDEXNAME_CS,
        SENTENCE_INDEXNAME_CS
};

static const std::string SUBINDEX_NAME{"subindex"};

// Default field sets; used as default arguments of
// IndexManager::get_document_details() / get_documents_details().
static const std::set<std::string> DOCUMENTS_FIELDS_DETAILED = {
        "accession_compressed",
        "title_compressed",
        "author_compressed",
        "journal_compressed",
        "year",
        "abstract_compressed",
        "filepath",
        "corpus",
        "doc_id",
        "fulltext_compressed",
        "type_compressed",
        "fulltext_cat_compressed"
};
static const std::set<std::string> SENTENCE_FIELDS_DETAILED = {
        "sentence_id",
        "begin",
        "end",
        "sentence_compressed",
        "sentence_cat_compressed"
};
47 
/**
 * Data structure that represents information about temporary configuration
 * files of an index. Returned by IndexManager::write_tmp_conf_files() and
 * passed to IndexManager::process_single_file().
 *
 * Plain aggregate: member order is part of the interface (brace initialisation).
 */
struct TmpConf {
    /// NOTE(review): exact content is set by write_tmp_conf_files(); not visible in this header.
    std::string new_index_flag;

    /// Index descriptor; presumably forwarded as the `index_descriptor`
    /// argument of add_cas_file_to_index() — confirm.
    std::string index_descriptor;

    /// Temporary directory; presumably forwarded as the `tempDir` argument
    /// of add_cas_file_to_index() — confirm.
    std::string tmp_dir;
};
61 
/**
 * Exception type of the tpc::index namespace. It is a std::runtime_error that
 * carries a caller-supplied message.
 *
 * what() is deliberately NOT overridden: the version inherited from
 * std::runtime_error returns the message passed to the constructor. The
 * previous override forwarded to std::exception::what(), which skips
 * std::runtime_error and yields a generic, implementation-defined string
 * instead of the message on libstdc++/libc++.
 */
class tpc_exception : public std::runtime_error {
public:
    /**
     * @param message null-terminated description of the error; copied into the exception
     *
     * Not declared non-throwing: std::runtime_error copies the message and
     * may therefore throw std::bad_alloc.
     */
    explicit tpc_exception(char const* const message) : std::runtime_error(message) { }

    /**
     * @param message description of the error; copied into the exception
     */
    explicit tpc_exception(const std::string& message) : std::runtime_error(message) { }
};
67 
71  class IndexManager {
72  public:
73 
74  IndexManager() = default;
75 
82  explicit IndexManager(const std::string& index_path, bool read_only = true, bool external = false):
83  index_dir(index_path),
84  readonly(read_only),
85  external(external),
86  readers_map(),
87  corpus_doc_counter(),
88  externalIndexManager() { };
89  ~IndexManager() {
90  close();
91  };
92  IndexManager(const IndexManager& other) {
93  readers_map = other.readers_map;
94  index_dir = other.index_dir;
95  readonly = other.readonly;
96  external = other.external;
97  corpus_doc_counter = other.corpus_doc_counter;
98  externalIndexManager = other.externalIndexManager;
99  };
100  IndexManager& operator=(const IndexManager& other) {
101  readers_map = other.readers_map;
102  index_dir = other.index_dir;
103  readonly = other.readonly;
104  external = other.external;
105  corpus_doc_counter = other.corpus_doc_counter;
106  externalIndexManager = other.externalIndexManager;
107  };
108  IndexManager(IndexManager&& other) noexcept :
109  readers_map(std::move(other.readers_map)),
110  readonly(other.readonly),
111  external(other.external),
112  index_dir(std::move(other.index_dir)),
113  corpus_doc_counter(std::move(other.corpus_doc_counter)),
114  externalIndexManager(std::move(other.externalIndexManager)) {}
115  IndexManager& operator=(IndexManager&& other) noexcept {
116  readers_map = std::move(other.readers_map);
117  index_dir = std::move(other.index_dir);
118  readonly = other.readonly;
119  external = other.external;
120  corpus_doc_counter = std::move(other.corpus_doc_counter);
121  externalIndexManager = std::move(other.externalIndexManager);
122  };
123 
124  void close() {
125  for (auto &it : readers_map) {
126  it.second->close();
127  }
128  }
129 
134  static std::vector<std::string> get_available_corpora();
135 
140  std::vector<std::string> get_additional_corpora();
141 
148  int get_num_articles_in_corpus(const std::string& corpus, bool external = false);
149 
170  SearchResults search_documents(const Query &query, bool matches_only = false,
171  const std::set<std::string> &doc_ids = {},
172  const SearchResults& partialResults = SearchResults());
173 
189  DocumentDetails get_document_details(const DocumentSummary &doc_summary,
190  bool include_sentences_details = true,
191  std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
192  std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
193  const std::set<std::string> &exclude_doc_fields = {},
194  const std::set<std::string> &exclude_match_sentences_fields = {});
195 
214  std::vector<DocumentDetails> get_documents_details(const std::vector<DocumentSummary> &doc_summaries,
215  bool sort_by_year,
216  bool include_sentences_details = true,
217  std::set<std::string> include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
218  std::set<std::string> include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
219  const std::set<std::string> &exclude_doc_fields = {},
220  const std::set<std::string> &exclude_match_sentences_fields = {});
221 
222  // comparators for reverse sorting of documents and sentence objects
223  static bool document_score_gt(const Document &a, const Document &b) { return a.score > b.score; }
224  static bool document_year_score_gt(const Document &a, const Document &b) {
225  if (a.year != b.year) return a.year > b.year;
226  return a.score > b.score;
227  }
228 
229  static bool sentence_greater_than(const SentenceSummary &a, const SentenceSummary &b) {
230  return a.score > b.score;
231  }
232 
238  void create_index_from_existing_cas_dir(const std::string &input_cas_dir,
239  const std::set<std::string>& file_list = {},
240  int max_num_papers_per_subindex = 50000);
241 
248  void add_file_to_index(const std::string& file_path, int max_num_papers_per_subindex = 50000);
249 
255  void remove_file_from_index(const std::string& identifier);
256 
260  void calculate_and_save_corpus_counter();
261 
265  void save_all_doc_ids_for_sentences_to_db();
266 
270  void save_all_years_for_documents_to_db();
271 
276  bool has_external_index();
277 
282  void set_external_index(std::string external_idx_path);
283 
287  void remove_external_index();
288 
293  std::vector<std::string> get_external_corpora();
294 
295  private:
296 
303  Lucene::Collection<Lucene::IndexReaderPtr> get_subreaders(QueryType type, bool case_sensitive = false);
304 
313  SearchResults read_documents_summaries(const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
314  bool sort_by_year = false);
315 
324  SearchResults read_sentences_summaries(const Lucene::Collection<Lucene::ScoreDocPtr> &matches_collection,
325  bool sort_by_year = false);
326 
339  void update_sentences_details_for_document(const DocumentSummary &doc_summary,
340  DocumentDetails &doc_details,
341  Lucene::QueryParserPtr sent_parser,
342  Lucene::SearcherPtr searcher,
343  Lucene::FieldSelectorPtr fsel,
344  const std::set<Lucene::String> &fields,
345  bool use_lucene_internal_ids,
346  Lucene::MultiReaderPtr sent_reader);
347 
348  static std::set<Lucene::String> compose_field_set(const std::set<std::string> &include_fields,
349  const std::set<std::string> &exclude_fields,
350  const std::set<std::string> &required_fields = {});
351 
352  void update_document_details(DocumentDetails &doc_details, Lucene::String field,
353  Lucene::DocumentPtr doc_ptr);
354 
355  std::vector<DocumentDetails> read_documents_details(const std::vector<DocumentSummary> &doc_summaries,
356  Lucene::QueryParserPtr doc_parser,
357  Lucene::SearcherPtr searcher,
358  Lucene::FieldSelectorPtr fsel,
359  const std::set<Lucene::String> &fields,
360  bool use_lucene_internal_ids,
361  Lucene::MultiReaderPtr doc_reader);
362 
368  static TmpConf write_tmp_conf_files(const std::string &index_path);
369 
374  static void create_subindex_dir_structure(const std::string &index_path);
375 
384  int add_cas_file_to_index(const char *file_path, std::string index_descriptor, std::string tempDir,
385  bool update_db);
386 
395  bool process_single_file(const std::string &filepath, bool &first_paper, const TmpConf &tmp_conf,
396  bool update_db = false);
397 
398  std::string remove_document_from_index(std::string identifier, bool case_sensitive);
399  void remove_sentences_for_document(const std::string& doc_id, bool case_sensitive);
400 
401  void add_doc_and_sentences_to_bdb(std::string identifier);
402 
403  void save_corpus_counter();
404 
405  void update_corpus_counter();
406 
410  void load_corpus_counter();
411  int get_num_docs_in_corpus_from_index(const std::string& corpus);
412 
413  std::map<std::string, Lucene::IndexReaderPtr> readers_map;
414  std::string index_dir;
415  bool readonly;
416  bool external;
417  std::map<std::string, int> corpus_doc_counter;
418  std::shared_ptr<IndexManager> externalIndexManager;
419  };
420  }
421 }
422 
423 
424 #endif //LIBTPC_TPCINDEXREADER_H
results generated by a search
Definition: DataStructures.h:167
data structure that contains summary information related to a document as the result of a search ...
Definition: DataStructures.h:91
data structure that contains summary information related to a sentence as the result of a search ...
Definition: DataStructures.h:44
Definition: IndexManager.h:62
data structure that contains detailed information related to a document as the result of a search ...
Definition: DataStructures.h:109
data structure that represents information about temporary configuration files of an index ...
Definition: IndexManager.h:56
Definition: CASManager.h:16
generic information of a document
Definition: DataStructures.h:77
IndexManager(const std::string &index_path, bool read_only=true, bool external=false)
Definition: IndexManager.h:82
Definition: IndexManager.h:71
Definition: DataStructures.h:126