libtpc
0.1
Textpressocentral core library
|
#include <IndexManager.h>
Public Member Functions | |
IndexManager (const std::string &index_path, bool read_only=true, bool external=false) | |
IndexManager (const IndexManager &other) | |
IndexManager & | operator= (const IndexManager &other) |
IndexManager (IndexManager &&other) noexcept | |
IndexManager & | operator= (IndexManager &&other) noexcept |
void | close () |
std::vector< std::string > | get_additional_corpora () |
int | get_num_articles_in_corpus (const std::string &corpus, bool external=false) |
SearchResults | search_documents (const Query &query, bool matches_only=false, const std::set< std::string > &doc_ids={}, const SearchResults &partialResults=SearchResults()) |
search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score More... | |
DocumentDetails | get_document_details (const DocumentSummary &doc_summary, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={}) |
get detailed information about a document specified by a DocumentSummary object More... | |
std::vector< DocumentDetails > | get_documents_details (const std::vector< DocumentSummary > &doc_summaries, bool sort_by_year, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={}) |
get detailed information for a set of documents specified by a list of DocumentSummary objects More... | |
void | create_index_from_existing_cas_dir (const std::string &input_cas_dir, const std::set< std::string > &file_list={}, int max_num_papers_per_subindex=50000) |
void | add_file_to_index (const std::string &file_path, int max_num_papers_per_subindex=50000) |
void | remove_file_from_index (const std::string &identifier) |
void | calculate_and_save_corpus_counter () |
void | save_all_doc_ids_for_sentences_to_db () |
void | save_all_years_for_documents_to_db () |
bool | has_external_index () |
void | set_external_index (std::string external_idx_path) |
void | remove_external_index () |
std::vector< std::string > | get_external_corpora () |
Static Public Member Functions | |
static std::vector< std::string > | get_available_corpora () |
static bool | document_score_gt (const Document &a, const Document &b) |
static bool | document_year_score_gt (const Document &a, const Document &b) |
static bool | sentence_greater_than (const SentenceSummary &a, const SentenceSummary &b) |
add, retrieve or remove documents from Textpresso index
|
inline explicit |
create a new index manager object
index_path | the path to the index |
read_only | whether the index should be opened in read-only mode |
external | whether the index is external or standalone |
void IndexManager::add_file_to_index | ( | const std::string & | file_path, |
int | max_num_papers_per_subindex = 50000 |
||
) |
add a file to a Textpresso index
file_path | the path to a compressed cas file |
literature | the literature of the file (note: this parameter does not appear in the current signature; the entry looks stale) |
max_num_papers_per_subindex | max number of papers per subindex |
void IndexManager::calculate_and_save_corpus_counter | ( | ) |
update the document counters for the index and save them to file
void IndexManager::create_index_from_existing_cas_dir | ( | const std::string & | input_cas_dir, |
const std::set< std::string > & | file_list = {} , |
||
int | max_num_papers_per_subindex = 50000 |
||
) |
create a Textpresso index from a set of cas files
input_cas_dir | the directory containing the cas files to be added to the index |
max_num_papers_per_subindex | max number of papers per subindex |
std::vector< std::string > IndexManager::get_additional_corpora | ( | ) |
return the list of additional corpora
|
static |
return the list of indexed corpora
DocumentDetails IndexManager::get_document_details | ( | const DocumentSummary & | doc_summary, |
bool | include_sentences_details = true , |
||
std::set< std::string > | include_doc_fields = DOCUMENTS_FIELDS_DETAILED , |
||
std::set< std::string > | include_match_sentences_fields = SENTENCE_FIELDS_DETAILED , |
||
const std::set< std::string > & | exclude_doc_fields = {} , |
||
const std::set< std::string > & | exclude_match_sentences_fields = {} |
||
) |
get detailed information about a document specified by a DocumentSummary object
doc_summary | the DocumentSummary object that identifies the document |
include_sentences_details | whether to retrieve the details of the matching sentences specified in the DocumentSummary object |
include_doc_fields | the list of fields to retrieve for the document. Retrieve all fields if not specified |
include_match_sentences_fields | the list of fields to retrieve for the matching sentences specified in the DocumentSummary object |
exclude_doc_fields | the list of fields to exclude for the document |
exclude_match_sentences_fields | the list of fields to exclude for the matching sentences specified in the DocumentSummary object |
std::vector< DocumentDetails > IndexManager::get_documents_details | ( | const std::vector< DocumentSummary > & | doc_summaries, |
bool | sort_by_year, | ||
bool | include_sentences_details = true , |
||
std::set< std::string > | include_doc_fields = DOCUMENTS_FIELDS_DETAILED , |
||
std::set< std::string > | include_match_sentences_fields = SENTENCE_FIELDS_DETAILED , |
||
const std::set< std::string > & | exclude_doc_fields = {} , |
||
const std::set< std::string > & | exclude_match_sentences_fields = {} |
||
) |
get detailed information for a set of documents specified by a list of DocumentSummary objects
doc_summaries | a list of DocumentSummary objects that identify the documents to be searched and, optionally, the list of sentences in the matching_sentences field of each document for which to retrieve detailed information |
sort_by_year | whether to sort the results by year |
include_sentences_details | whether to retrieve the details of the matching sentences specified in the DocumentSummary object |
include_doc_fields | the list of fields to retrieve for the document. Retrieve all fields if not specified |
include_match_sentences_fields | the list of fields to retrieve for the matching sentences specified in the DocumentSummary object |
exclude_doc_fields | the list of fields to exclude for the document |
exclude_match_sentences_fields | the list of fields to exclude for the matching sentences specified in the DocumentSummary object |
std::vector< std::string > IndexManager::get_external_corpora | ( | ) |
get the list of additional corpora available from the external index
int IndexManager::get_num_articles_in_corpus | ( | const std::string & | corpus, |
bool | external = false |
||
) |
return the number of articles indexed under a specific corpus
corpus | the value of the corpus |
external | whether to retrieve the number of articles per corpus from the external index |
bool IndexManager::has_external_index | ( | ) |
whether the index has an external index attached
void IndexManager::remove_external_index | ( | ) |
remove the external index
void IndexManager::remove_file_from_index | ( | const std::string & | identifier | ) |
remove a specific file from the index
identifier | the id of the file to remove, currently represented by the filepath field stored in Lucene |
void IndexManager::save_all_doc_ids_for_sentences_to_db | ( | ) |
create an external database for sentences containing their document ids
void IndexManager::save_all_years_for_documents_to_db | ( | ) |
create an external database for documents containing their year field
SearchResults IndexManager::search_documents | ( | const Query & | query, |
bool | matches_only = false , |
||
const std::set< std::string > & | doc_ids = {} , |
||
const SearchResults & | partialResults = SearchResults() |
||
) |
search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score
The results returned by this method contain basic information about the documents matching the search
Note that while the documents are sorted by score, their matched sentences (in the case of sentence searches) are left unsorted for better performance
query | a query object |
matches_only | perform a partial search that returns a Lucene internal object representing the collection of matches. This object can be passed to a subsequent call to this method to continue the search and get the complete results. This is useful to get an initial estimate of the size of the complete search |
doc_ids | limit the search to a set of document ids. This is useful for sentence queries to retrieve the sentence ids for a set of documents obtained by a previous search without ids |
partialResults | the results of a previous partial search. The search will be completed with the sentence or document scores starting from the provided matching documents |
void IndexManager::set_external_index | ( | std::string | external_idx_path | ) |
add an external index to the main one
external_idx_path | the path to the external index |