9 #ifndef LIBTPC_TPCCASMANAGER_H 10 #define LIBTPC_TPCCASMANAGER_H 14 #include "uima/xmiwriter.hpp" 20 static const std::string PDF2TPCAS_DESCRIPTOR(
"/usr/local/uima_descriptors/TpTokenizer.xml");
/// Filesystem path of the TxTokenizer.xml UIMA descriptor.
/// NOTE(review): going by the constant's name this is the descriptor for the
/// XML -> tpcas conversion step — confirm at the call site.
static const std::string XML2TPCAS_DESCRIPTOR("/usr/local/uima_descriptors/TxTokenizer.xml");
/// Filesystem path of the TpLexiconAnnotatorFromPg.xml UIMA descriptor.
/// NOTE(review): this path uses the directory "uima-descriptors" (hyphen) while
/// the PDF2TPCAS/XML2TPCAS descriptor constants use "uima_descriptors"
/// (underscore). The value is kept exactly as found; verify which directory
/// actually exists on the deployment hosts.
static const std::string TPCAS1_2_TPCAS2_DESCRIPTOR(
        "/usr/local/uima-descriptors/TpLexiconAnnotatorFromPg.xml");
/// Table of (corpus name, regular expression) pairs for the PMCOA corpora.
///
/// Every pattern is wrapped in `.*` ... `.*`, i.e. it is written to be matched
/// against a whole string while accepting the keyword anywhere inside it.
/// NOTE(review): presumably consumed by the article classification routine —
/// the consumer and the regex engine it uses are not visible in this header.
///
/// Escaping: a literal dot in an abbreviated species name is written `\\.` so
/// that the regex receives `\.`. A single backslash before `.` is not a valid
/// escape sequence in an ordinary C++ string literal.
///
/// NOTE(review): several patterns are deliberately or accidentally broad
/// (e.g. the `Mus` alternative also matches inside "Muscle", `[Oo]nc` inside
/// "concentration"). They are left unchanged here.
static const std::vector<std::pair<std::string, std::string>> PMCOA_CAT_REGEX{
        // --- subject areas ---
        {"PMCOA Biology", ".*[Bb]io.*"},
        {"PMCOA Neuroscience", ".*[Nn]euro.*"},
        {"PMCOA Oncology", ".*([Cc]anc|[Oo]nc).*"},
        {"PMCOA Methodology", ".*[Mm]ethod.*"},
        {"PMCOA Medicine", ".*[Mm]edic.*"},
        {"PMCOA Virology", ".*[Vv]ir(us|ol).*"},
        {"PMCOA Genetics", ".*[Gg]enet.*"},
        {"PMCOA Animal", ".*[Aa]nimal.*"},
        {"PMCOA Clinical", ".*[Cc]linic.*"},
        {"PMCOA Genomics", ".*[Gg]enom.*"},
        {"PMCOA Disease", ".*[Dd]i(seas|abet).*"},
        {"PMCOA Agriculture", ".*[Aa]gricult.*"},
        {"PMCOA Physiology", ".*[Pp]hysiol.*"},
        {"PMCOA Psychology", ".*[Pp]sych(ol|iat).*"},
        {"PMCOA Crystallography", ".*[Cc]rystal.*"},
        {"PMCOA Chemistry", ".*[Cc]hemi.*"},
        {"PMCOA Health", ".*[Hh]ealth.*"},
        {"PMCOA Cardiology", ".*([Cc]ardi|[Hh]eart).*"},
        {"PMCOA Pharmacology", ".*[Pp]harm.*"},
        {"PMCOA Nutrition", ".*[Nn]utri.*"},
        {"PMCOA Immunology", ".*[Ii]mmuno.*"},
        {"PMCOA Pediatrics", ".*[Pp]a?ediatri.*"},
        {"PMCOA Review", ".*[Rr]eview.*"},
        {"PMCOA Protein", ".*[Pp]rotein.*"},
        // --- organisms ---
        // "Drosophila" was previously misspelled "Drosphila" and so never matched the genus name.
        {"PMCOA D. melanogaster", ".*(Drosophila( melanogaster)?|[Ff]ruit [Ff]ly|D\\. melanogaster).*"},
        {"PMCOA C. elegans", ".*(Caenorhabditis( elegans)?|C\\. elegans).*"},
        {"PMCOA A. thaliana", ".*(Arabidopsis( thaliana)?|A\\. thaliana).*"},
        // "[Mm]usculus" / "[Mm]urine" were previously misspelled "[Mm]usulus" / "[Mm]murine".
        {"PMCOA M. musculus", ".*(Mus( musculus)?|M\\. musculus|[Mm]usculus|[Mm]urine|[Mm]ouse|[Mm]ice).*"},
        {"PMCOA D. rerio", ".*(Danio rerio|D\\. rerio|[Zz]ebrafish).*"},
        {"PMCOA S. cerevisiae", ".*(Saccharomyces( cerevisiae)?|S\\. cerevisiae|[Bb]udding [Yy]east).*"},
        {"PMCOA S. pombe", ".*(Schizosaccharomyces( pombe)?|S\\. pombe|([Ff]ission) [Yy]east).*"},
        {"PMCOA D. discoideum", ".*(Dictyostelium( discoideum)?|D\\. discoideum|[Ss]lime [Mm]old).*"},
        {"PMCOA R. norvegicus", ".*(Rattus norvegicus|R\\. norvegicus|Norway brown rat).*"},
        {"PMCOA R. rattus", ".*(Rattus rattus|R\\. rattus|[Bb]lack rat).*"},
        {"PMCOA C. intestinalis", ".*(Ciona intestinalis|C\\. intestinalis|[Ss]ea [Ss]quirt).*"},
        {"PMCOA X. laevis", ".*(Xenopus laevis|X\\. laevis|African clawed frog).*"},
        {"PMCOA X. tropicalis", ".*(Xenopus tropicalis|X\\. tropicalis|Western clawed frog).*"},
        {"PMCOA E. coli", ".*(Escherichia coli|E\\. coli).*"},
        {"PMCOA B. subtilis", ".*(Bacillus subtilis|B\\. subtilis).*"}};
/// Corpus name "PMCOA Unclassified".
/// NOTE(review): presumably the fallback label for articles that match none of
/// the PMCOA category patterns — confirm in the classification code.
static const std::string PMCOA_UNCLASSIFIED("PMCOA Unclassified");
/// Corpus name "C. elegans".
static const std::string CELEGANS("C. elegans");
/// Corpus name "C. elegans Supplementals".
static const std::string CELEGANS_SUP("C. elegans Supplementals");
/// Accession string of the record.
/// NOTE(review): the enclosing type is not visible in this excerpt; presumably
/// this is a field of the bibliographic-information structure — confirm.
std::string accession;
80 static void convert_raw_file_to_cas1(
const std::string &file_path, FileType type,
81 const std::string &out_dir,
bool use_parent_dir_as_outname =
false);
/**
 * Declaration only; the implementation lives outside this header.
 *
 * NOTE(review): judging by the name, converts a "cas1" file into a "cas2"
 * file — confirm against the implementation.
 *
 * @param file_path path of the input file
 * @param out_dir directory that receives the output
 * @return an int whose meaning is not visible here (status code? count?) —
 *         TODO document once confirmed
 */
static int convert_cas1_to_cas2(const std::string& file_path, const std::string& out_dir);
96 static BibInfo get_bib_info_from_xml_text(
const std::string& xml_text);
103 static std::vector<std::string> classify_article_into_corpora_from_bib_file(
const BibInfo& bib_info);
107 static void writeXmi(uima::CAS &outCas,
int num, std::string outfn);
#endif // LIBTPC_TPCCASMANAGER_H

// Cross-reference notes carried over from the generated source listing:
//   - data structure that represents bib information of a cas file
//     (Definition: CASManager.h:60)
//   - Definition: CASManager.h:16
//   - Definition: CASManager.h:72