libtpc  0.1
Textpressocentral core library
Public Member Functions | Static Public Member Functions | List of all members
tpc::index::IndexManager Class Reference

#include <IndexManager.h>

Public Member Functions

 IndexManager (const std::string &index_path, bool read_only=true, bool external=false)
 
 IndexManager (const IndexManager &other)
 
IndexManager & operator= (const IndexManager &other)
 
 IndexManager (IndexManager &&other) noexcept
 
IndexManager & operator= (IndexManager &&other) noexcept
 
void close ()
 
std::vector< std::string > get_additional_corpora ()
 
int get_num_articles_in_corpus (const std::string &corpus, bool external=false)
 
SearchResults search_documents (const Query &query, bool matches_only=false, const std::set< std::string > &doc_ids={}, const SearchResults &partialResults=SearchResults())
 search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score More...
 
DocumentDetails get_document_details (const DocumentSummary &doc_summary, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={})
 get detailed information about a document specified by a DocumentSummary object More...
 
std::vector< DocumentDetails > get_documents_details (const std::vector< DocumentSummary > &doc_summaries, bool sort_by_year, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={})
 get detailed information for a set of documents specified by a list of DocumentSummary objects More...
 
void create_index_from_existing_cas_dir (const std::string &input_cas_dir, const std::set< std::string > &file_list={}, int max_num_papers_per_subindex=50000)
 
void add_file_to_index (const std::string &file_path, int max_num_papers_per_subindex=50000)
 
void remove_file_from_index (const std::string &identifier)
 
void calculate_and_save_corpus_counter ()
 
void save_all_doc_ids_for_sentences_to_db ()
 
void save_all_years_for_documents_to_db ()
 
bool has_external_index ()
 
void set_external_index (std::string external_idx_path)
 
void remove_external_index ()
 
std::vector< std::string > get_external_corpora ()
 

Static Public Member Functions

static std::vector< std::string > get_available_corpora ()
 
static bool document_score_gt (const Document &a, const Document &b)
 
static bool document_year_score_gt (const Document &a, const Document &b)
 
static bool sentence_greater_than (const SentenceSummary &a, const SentenceSummary &b)
 

Detailed Description

add, retrieve or remove documents from Textpresso index

Constructor & Destructor Documentation

tpc::index::IndexManager::IndexManager ( const std::string &  index_path,
bool  read_only = true,
bool  external = false 
)
inline explicit

create a new index manager object

Parameters
index_path: the path to the index
read_only: whether the index should be opened in read-only mode
external: whether the index is external or standalone

Member Function Documentation

void IndexManager::add_file_to_index ( const std::string &  file_path,
int  max_num_papers_per_subindex = 50000 
)

add a file to a textpresso index

Parameters
file_path: the path to a compressed cas file
literature: the literature of the file
max_num_papers_per_subindex: max number of papers per subindex
void IndexManager::calculate_and_save_corpus_counter ( )

update the document counters for the index and save them to file

void IndexManager::create_index_from_existing_cas_dir ( const std::string &  input_cas_dir,
const std::set< std::string > &  file_list = {},
int  max_num_papers_per_subindex = 50000 
)

create a textpresso index from a set of cas files

Parameters
input_cas_dir: the directory containing the cas files to be added to the index
max_num_papers_per_subindex: max number of papers per subindex
std::vector< std::string > IndexManager::get_additional_corpora ( )

return the list of additional corpora

Returns
a vector of strings, representing the list of additional corpora in the index
std::vector< std::string > IndexManager::get_available_corpora ( )
static

return the list of indexed corpora

Returns
a vector of strings, representing the list of available corpora in the index
DocumentDetails IndexManager::get_document_details ( const DocumentSummary &  doc_summary,
bool  include_sentences_details = true,
std::set< std::string >  include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
std::set< std::string >  include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set< std::string > &  exclude_doc_fields = {},
const std::set< std::string > &  exclude_match_sentences_fields = {} 
)

get detailed information about a document specified by a DocumentSummary object

Parameters
doc_summary: the DocumentSummary object that identifies the document
include_sentences_details: whether to retrieve the details of the matching sentences specified in the DocumentSummary object
include_doc_fields: the list of fields to retrieve for the document. Retrieve all fields if not specified
include_match_sentences_fields: the list of fields to retrieve for the matching sentences specified in the DocumentSummary object
exclude_doc_fields: the list of fields to exclude for the document
exclude_match_sentences_fields: the list of fields to exclude for the matching sentences specified in the DocumentSummary object
Returns
the detailed information of the document
vector< DocumentDetails > IndexManager::get_documents_details ( const std::vector< DocumentSummary > &  doc_summaries,
bool  sort_by_year,
bool  include_sentences_details = true,
std::set< std::string >  include_doc_fields = DOCUMENTS_FIELDS_DETAILED,
std::set< std::string >  include_match_sentences_fields = SENTENCE_FIELDS_DETAILED,
const std::set< std::string > &  exclude_doc_fields = {},
const std::set< std::string > &  exclude_match_sentences_fields = {} 
)

get detailed information for a set of documents specified by a list of DocumentSummary objects

Parameters
doc_summaries: a list of DocumentSummary objects that identify the documents to be searched and, optionally, the list of sentences in the matching_sentences field of each document for which to retrieve detailed information
sort_by_year: whether to sort the results by year
include_sentences_details: whether to retrieve the details of the matching sentences specified in the DocumentSummary object
include_doc_fields: the list of fields to retrieve for the document. Retrieve all fields if not specified
include_match_sentences_fields: the list of fields to retrieve for the matching sentences specified in the DocumentSummary object
exclude_doc_fields: the list of fields to exclude for the document
exclude_match_sentences_fields: the list of fields to exclude for the matching sentences specified in the DocumentSummary object
Returns
the detailed information of the documents
vector< string > IndexManager::get_external_corpora ( )

get the list of additional corpora available from the external index

Returns
the list of additional corpora of the external index
int IndexManager::get_num_articles_in_corpus ( const std::string &  corpus,
bool  external = false 
)

return the number of articles indexed under a specific corpus

Parameters
corpus: the value of the corpus
external: whether to retrieve the number of articles per corpus from the external index
Returns
the number of articles indexed under the specified corpus
bool IndexManager::has_external_index ( )

whether the index has an external index attached

Returns
true if the index has an external index attached, false otherwise
void IndexManager::remove_external_index ( )

remove the external index

void IndexManager::remove_file_from_index ( const std::string &  identifier)

remove a specific file from the index

Parameters
identifier: the id of the file to remove, currently represented by the filepath field stored in lucene
void IndexManager::save_all_doc_ids_for_sentences_to_db ( )

create an external database for sentences containing their document ids

void IndexManager::save_all_years_for_documents_to_db ( )

create an external database for documents containing their year field

SearchResults IndexManager::search_documents ( const Query &  query,
bool  matches_only = false,
const std::set< std::string > &  doc_ids = {},
const SearchResults &  partialResults = SearchResults() 
)

search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score

The results returned by this method contain basic information regarding the documents matching the searches

Note that while the documents are sorted by score, their matched sentences, in case of sentence searches, are not sorted in order to obtain better performance

Parameters
query: a query object
matches_only: perform a partial search that returns a Lucene internal object representing the collection of matches. This object can be passed to a subsequent call to this method to continue the search and get the complete results. This is useful to get an initial estimate of the size of the complete search
doc_ids: limit the search to a set of document ids. This is useful for sentence queries to retrieve the sentence ids for a set of documents obtained by a previous search without ids
partialResults: the results of a previous partial search. The search will be completed with the sentence or document scores starting from the provided matching documents
Returns
the list of the documents matching the query sorted by their scores and encapsulated in a SearchResults object
void IndexManager::set_external_index ( std::string  external_idx_path)

add an external index to the main one

Parameters
external_idx_path: the path to the external index

The documentation for this class was generated from the following files: