#include <IndexManager.h>

Public Member Functions
	IndexManager (const std::string &index_path, bool read_only=true, bool external=false)

	IndexManager (const IndexManager &other)

IndexManager &	operator= (const IndexManager &other)

	IndexManager (IndexManager &&other) noexcept

IndexManager &	operator= (IndexManager &&other) noexcept

void	close ()

std::vector< std::string >	get_additional_corpora ()

int	get_num_articles_in_corpus (const std::string &corpus, bool external=false)

SearchResults	search_documents (const Query &query, bool matches_only=false, const std::set< std::string > &doc_ids={}, const SearchResults &partialResults=SearchResults())
	search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score More...

DocumentDetails	get_document_details (const DocumentSummary &doc_summary, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={})
	get detailed information about a document specified by a DocumentSummary object More...

std::vector< DocumentDetails >	get_documents_details (const std::vector< DocumentSummary > &doc_summaries, bool sort_by_year, bool include_sentences_details=true, std::set< std::string > include_doc_fields=DOCUMENTS_FIELDS_DETAILED, std::set< std::string > include_match_sentences_fields=SENTENCE_FIELDS_DETAILED, const std::set< std::string > &exclude_doc_fields={}, const std::set< std::string > &exclude_match_sentences_fields={})
	get detailed information for a set of documents specified by a list of DocumentSummary objects More...

void	create_index_from_existing_cas_dir (const std::string &input_cas_dir, const std::set< std::string > &file_list={}, int max_num_papers_per_subindex=50000)

void	add_file_to_index (const std::string &file_path, int max_num_papers_per_subindex=50000)

void	remove_file_from_index (const std::string &identifier)

void	calculate_and_save_corpus_counter ()

void	save_all_doc_ids_for_sentences_to_db ()

void	save_all_years_for_documents_to_db ()

bool	has_external_index ()

void	set_external_index (std::string external_idx_path)

void	remove_external_index ()

std::vector< std::string >	get_external_corpora ()

Static Public Member Functions
static std::vector< std::string >	get_available_corpora ()

static bool	document_score_gt (const Document &a, const Document &b)

static bool	document_year_score_gt (const Document &a, const Document &b)

static bool	sentence_greater_than (const SentenceSummary &a, const SentenceSummary &b)

Detailed Description

add, retrieve or remove documents from Textpresso index

Constructor & Destructor Documentation

tpc::index::IndexManager::IndexManager	(	const std::string &	index_path,
		bool	read_only = `true`,
		bool	external = `false`
	)

inline explicit

create a new index manager object

Parameters

index_path	the path to the index
read_only	whether the index should be opened in read-only mode
external	whether the index is external or standalone

Member Function Documentation

void IndexManager::add_file_to_index	(	const std::string &	file_path,
		int	max_num_papers_per_subindex = `50000`
	)

add a file to a Textpresso index

Parameters

file_path	the path to a compressed cas file
literature	the literature of the file
max_num_papers_per_subindex	max number of papers per subindex

void IndexManager::calculate_and_save_corpus_counter ( )

update the document counters for the index and save them to file

void IndexManager::create_index_from_existing_cas_dir	(	const std::string &	input_cas_dir,
		const std::set< std::string > &	file_list = `{}`,
		int	max_num_papers_per_subindex = `50000`
	)

create a Textpresso index from a set of cas files

Parameters

input_cas_dir	the directory containing the cas files to be added to the index
max_num_papers_per_subindex	max number of papers per subindex

std::vector< std::string > IndexManager::get_additional_corpora ( )

return the list of additional corpora

Returns: a vector of strings, representing the list of additional corpora in the index

std::vector< std::string > IndexManager::get_available_corpora ( )

static

return the list of indexed corpora

Returns: a vector of strings, representing the list of available corpora in the index

DocumentDetails IndexManager::get_document_details	(	const DocumentSummary &	doc_summary,
		bool	include_sentences_details = `true`,
		std::set< std::string >	include_doc_fields = `DOCUMENTS_FIELDS_DETAILED`,
		std::set< std::string >	include_match_sentences_fields = `SENTENCE_FIELDS_DETAILED`,
		const std::set< std::string > &	exclude_doc_fields = `{}`,
		const std::set< std::string > &	exclude_match_sentences_fields = `{}`
	)

get detailed information about a document specified by a DocumentSummary object

Parameters

doc_summary	the DocumentSummary object that identifies the document
include_sentences_details	whether to retrieve the details of the matching sentences specified in the DocumentSummary object
include_doc_fields	the list of fields to retrieve for the document. Retrieve all fields if not specified
include_match_sentences_fields	the list of fields to retrieve for the matching sentences specified in the DocumentSummary object
exclude_doc_fields	the list of fields to exclude for the document
exclude_match_sentences_fields	the list of fields to exclude for the matching sentences specified in the DocumentSummary object

Returns: the detailed information of the document

vector< DocumentDetails > IndexManager::get_documents_details	(	const std::vector< DocumentSummary > &	doc_summaries,
		bool	sort_by_year,
		bool	include_sentences_details = `true`,
		std::set< std::string >	include_doc_fields = `DOCUMENTS_FIELDS_DETAILED`,
		std::set< std::string >	include_match_sentences_fields = `SENTENCE_FIELDS_DETAILED`,
		const std::set< std::string > &	exclude_doc_fields = `{}`,
		const std::set< std::string > &	exclude_match_sentences_fields = `{}`
	)

get detailed information for a set of documents specified by a list of DocumentSummary objects

Parameters

doc_summaries	a list of DocumentSummary objects that identify the documents to be searched and, optionally, the list of sentences in the matching_sentences field of each document for which to retrieve detailed information
sort_by_year	whether to sort the results by year
include_sentences_details	whether to retrieve the details of the matching sentences specified in the DocumentSummary object
include_doc_fields	the list of fields to retrieve for the document. Retrieve all fields if not specified
include_match_sentences_fields	the list of fields to retrieve for the matching sentences specified in the DocumentSummary object
exclude_doc_fields	the list of fields to exclude for the document
exclude_match_sentences_fields	the list of fields to exclude for the matching sentences specified in the DocumentSummary object

Returns: the detailed information of the documents

vector< string > IndexManager::get_external_corpora ( )

get the list of additional corpora available from the external index

Returns: the list of additional corpora of the external index

int IndexManager::get_num_articles_in_corpus	(	const std::string &	corpus,
		bool	external = `false`
	)

return the number of articles indexed under a specific corpus

Parameters

corpus	the value of the corpus
external	whether to retrieve the number of articles per corpus from the external index

Returns: the number of articles indexed under the specified corpus

bool IndexManager::has_external_index ( )

whether the index has an external index attached

Returns: true if the index has an external index attached, false otherwise

void IndexManager::remove_external_index ( )

remove the external index

void IndexManager::remove_file_from_index ( const std::string & identifier )

remove a specific file from the index

Parameters

identifier the id of the file to remove, currently represented by the filepath field stored in lucene

void IndexManager::save_all_doc_ids_for_sentences_to_db ( )

create an external database for sentences containing their document ids

void IndexManager::save_all_years_for_documents_to_db ( )

create an external database for documents containing their year field

SearchResults IndexManager::search_documents	(	const Query &	query,
		bool	matches_only = `false`,
		const std::set< std::string > &	doc_ids = `{}`,
		const SearchResults &	partialResults = `SearchResults()`
	)

search the Textpresso index for documents matching the provided Lucene query and return summary information with a list of results sorted by their match score

The results returned by this method contain basic information regarding the documents matching the search

Note that while the documents are sorted by score, their matched sentences (in the case of sentence searches) are left unsorted for better performance

Parameters

query	a query object
matches_only	perform a partial search that returns a Lucene internal object representing the collection of matches. This object can be passed to a subsequent call to this method to continue the search and get the complete results. This is useful to get an initial estimate of the size of the complete search
doc_ids	limit the search to a set of document ids. This is useful for sentence queries to retrieve the sentence ids for a set of documents obtained by a previous search without ids
partialResults	the results of a previous partial search. The search will be completed with the sentence or document scores starting from the provided matching documents

Returns: the list of the documents matching the query sorted by their scores and encapsulated in a SearchResults object

void IndexManager::set_external_index ( std::string external_idx_path )

add an external index to the main one

Parameters

external_idx_path the path to the external index

The documentation for this class was generated from the following files:

IndexManager.h
IndexManager.cpp

Public Member Functions

Static Public Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation