libtpc  0.1
Textpressocentral core library
CASManager.h
1 
9 #ifndef LIBTPC_TPCCASMANAGER_H
10 #define LIBTPC_TPCCASMANAGER_H
11 
12 #include <string>
13 #include <vector>
14 #include "uima/xmiwriter.hpp"
15 
16 namespace tpc {
17 
18  namespace cas {
19 
/// Absolute paths of the UIMA descriptor XML files referenced by this library.
/// All descriptors are expected in the same directory, /usr/local/uima_descriptors/;
/// keep the directory spelling identical across the entries.

/// Descriptor TpTokenizer.xml (name suggests the PDF -> TPCAS conversion; confirm in CASManager.cpp).
static const std::string PDF2TPCAS_DESCRIPTOR("/usr/local/uima_descriptors/TpTokenizer.xml");
/// Descriptor TxTokenizer.xml (name suggests the XML -> TPCAS conversion; confirm in CASManager.cpp).
static const std::string XML2TPCAS_DESCRIPTOR("/usr/local/uima_descriptors/TxTokenizer.xml");
/// Descriptor TpLexiconAnnotatorFromPg.xml (name suggests the CAS1 -> CAS2 step; confirm in CASManager.cpp).
/// The directory uses an underscore like the two descriptors above; the hyphenated
/// spelling "uima-descriptors" pointed at a different, inconsistent location.
static const std::string TPCAS1_2_TPCAS2_DESCRIPTOR("/usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml");
23 
/// Table of (corpus name, regular expression) pairs for the PMCOA categories.
///
/// Every pattern is wrapped in a leading and trailing ".*", i.e. it is written to be
/// used with a whole-string match. Which text the patterns are applied to is decided
/// by the implementation file (presumably the article classification routine of
/// CASManager; confirm there).
///
/// Notes for maintainers:
///  - A literal dot in an abbreviated species name must be written as "\\." in the
///    C++ source so that the regex engine receives "\."; a single backslash before
///    the dot is an unknown escape sequence in a string literal and degrades to a
///    bare "." (match any character).
///  - Entry order is preserved from the original table.
static const std::vector<std::pair<std::string, std::string>> PMCOA_CAT_REGEX{
    // Subject-area categories.
    {"PMCOA Biology", ".*[Bb]io.*"},
    {"PMCOA Neuroscience", ".*[Nn]euro.*"},
    {"PMCOA Oncology", ".*([Cc]anc|[Oo]nc).*"},
    {"PMCOA Methodology", ".*[Mm]ethod.*"},
    {"PMCOA Medicine", ".*[Mm]edic.*"},
    {"PMCOA Virology", ".*[Vv]ir(us|ol).*"},
    {"PMCOA Genetics", ".*[Gg]enet.*"},
    {"PMCOA Animal", ".*[Aa]nimal.*"},
    {"PMCOA Clinical", ".*[Cc]linic.*"},
    {"PMCOA Genomics", ".*[Gg]enom.*"},
    {"PMCOA Disease", ".*[Dd]i(seas|abet).*"},
    {"PMCOA Agriculture", ".*[Aa]gricult.*"},
    {"PMCOA Physiology", ".*[Pp]hysiol.*"},
    {"PMCOA Psychology", ".*[Pp]sych(ol|iat).*"},
    {"PMCOA Crystallography", ".*[Cc]rystal.*"},
    {"PMCOA Chemistry", ".*[Cc]hemi.*"},
    {"PMCOA Health", ".*[Hh]ealth.*"},
    {"PMCOA Cardiology", ".*([Cc]ardi|[Hh]eart).*"},
    {"PMCOA Pharmacology", ".*[Pp]harm.*"},
    {"PMCOA Nutrition", ".*[Nn]utri.*"},
    {"PMCOA Immunology", ".*[Ii]mmuno.*"},
    {"PMCOA Pediatrics", ".*[Pp]a?ediatri.*"},
    {"PMCOA Review", ".*[Rr]eview.*"},
    {"PMCOA Protein", ".*[Pp]rotein.*"},
    // Organism categories.
    {"PMCOA D. melanogaster", ".*(Drosophila( melanogaster)?|[Ff]ruit [Ff]ly|D\\. melanogaster).*"},
    {"PMCOA C. elegans", ".*(Caenorhabditis( elegans)?|C\\. elegans).*"},
    {"PMCOA A. thaliana", ".*(Arabidopsis( thaliana)?|A\\. thaliana).*"},
    {"PMCOA M. musculus", ".*(Mus( musculus)?|M\\. musculus|[Mm]usculus|[Mm]urine|[Mm]ouse|[Mm]ice).*"},
    {"PMCOA D. rerio", ".*(Danio rerio|D\\. rerio|[Zz]ebrafish).*"},
    {"PMCOA S. cerevisiae", ".*(Saccharomyces( cerevisiae)?|S\\. cerevisiae|[Bb]udding [Yy]east).*"},
    {"PMCOA S. pombe", ".*(Schizosaccharomyces( pombe)?|S\\. pombe|([Ff]ission) [Yy]east).*"},
    {"PMCOA D. discoideum", ".*(Dictyostelium( discoideum)?|D\\. discoideum|[Ss]lime [Mm]old).*"},
    {"PMCOA R. norvegicus", ".*(Rattus norvegicus|R\\. norvegicus|Norway brown rat).*"},
    {"PMCOA R. rattus", ".*(Rattus rattus|R\\. rattus|[Bb]lack rat).*"},
    {"PMCOA C. intestinalis", ".*(Ciona intestinalis|C\\. intestinalis|[Ss]ea [Ss]quirt).*"},
    {"PMCOA X. laevis", ".*(Xenopus laevis|X\\. laevis|African clawed frog).*"},
    {"PMCOA X. tropicalis", ".*(Xenopus tropicalis|X\\. tropicalis|Western clawed frog).*"},
    {"PMCOA E. coli", ".*(Escherichia coli|E\\. coli).*"},
    {"PMCOA B. subtilis", ".*(Bacillus subtilis|B\\. subtilis).*"}};
37 
/// Corpus-name constants.
/// NOTE(review): the names suggest PMCOA_UNCLASSIFIED is the fallback corpus when no
/// PMCOA category applies, and that the other two name the C. elegans corpora; the
/// actual use is in the implementation file, so confirm there.
static const std::string PMCOA_UNCLASSIFIED = "PMCOA Unclassified";
static const std::string CELEGANS = "C. elegans";
static const std::string CELEGANS_SUP = "C. elegans Supplementals";
41 
/// Kind of raw input file handed to the conversion routines.
/// The numeric values are fixed explicitly (pdf = 1, xml = 2).
enum class FileType {
    pdf = 1,  ///< input is a PDF document
    xml = 2   ///< input is an XML document
};
45 
/// Data structure that represents the bibliographic information of a CAS file.
///
/// Plain aggregate of strings; every member is default-constructed to an empty
/// string. The exact formatting of each value (e.g. how several authors are
/// separated, or how the year is written) is not defined in this header.
struct BibInfo {
    std::string author;     ///< author field
    std::string accession;  ///< accession field
    std::string type;       ///< type field
    std::string title;      ///< title field
    std::string journal;    ///< journal field
    std::string citation;   ///< citation field
    std::string year;       ///< year field, kept as text
    std::string abstract;   ///< abstract field
    std::string subject;    ///< subject field
};
71 
72  class CASManager {
73  public:
80  static void convert_raw_file_to_cas1(const std::string &file_path, FileType type,
81  const std::string &out_dir, bool use_parent_dir_as_outname = false);
82 
89  static int convert_cas1_to_cas2(const std::string& file_path, const std::string& out_dir);
90 
96  static BibInfo get_bib_info_from_xml_text(const std::string& xml_text);
97 
103  static std::vector<std::string> classify_article_into_corpora_from_bib_file(const BibInfo& bib_info);
104 
105  private:
106 
107  static void writeXmi(uima::CAS &outCas, int num, std::string outfn);
108  };
109  }
110 }
111 
112 #endif //LIBTPC_TPCCASMANAGER_H
Data structure that represents the bibliographic information of a CAS file.
Definition: CASManager.h:60
Definition: CASManager.h:16
Definition: CASManager.h:72