% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PDE.R
\name{PDE_pdfs2table_searchandfilter}
\alias{PDE_pdfs2table_searchandfilter}
\title{Extracting tables from a PDF (Portable Document Format) file}
\usage{
PDE_pdfs2table_searchandfilter(
  pdfs,
  out = ".",
  filter.words = "",
  ignore.case.fw = FALSE,
  filter.word.times = 20,
  table.heading.words = "",
  ignore.case.th = FALSE,
  search.words,
  ignore.case.sw = FALSE,
  eval.abbrevs = TRUE,
  out.table.format = ".csv (WINDOWS-1252)",
  dev = 20,
  write.table.locations = FALSE,
  exp.nondetc.tabs = FALSE,
  write.tab.doc.file = FALSE,
  delete = TRUE,
  verbose = TRUE
)
}
\arguments{
\item{pdfs}{String. A list of paths to the PDF files to be analyzed.}

\item{out}{String. Directory chosen to save analysis results in. Default:
\code{"."}.}

\item{filter.words}{List of strings. The list of filter words. If not
\code{NA} or \code{""} a hit will be counted every time a word from the list
is detected in the article. Regex rules apply (see also
\url{https://rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).
 Default: \code{""}.}

\item{ignore.case.fw}{Logical. Should the case of the filter words be ignored
(i.e., should capitalization not matter)? Default: \code{FALSE}.}

\item{filter.word.times}{Numeric. The minimum number of hits described for
\code{filter.words} for a paper to be further analyzed. Default: \code{20}.}

\item{table.heading.words}{List of strings. Table headings to be detected in
addition to the standard ones (TABLE, TAB or table plus number). Regex rules
apply (see also
\url{https://rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).
 Default: \code{""}.}

\item{ignore.case.th}{Logical. Should the case of the additional table
headings (see \code{table.heading.words}) be ignored (i.e., should
capitalization not matter)? Default: \code{FALSE}.}

\item{search.words}{List of strings. List of search words. To extract all
tables from the PDF file leave \code{search.words = ""}. Regex rules apply 
(see also
\url{https://rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf}).}

\item{ignore.case.sw}{Logical. Should the case of the search words be ignored
(i.e., should capitalization not matter)? Default: \code{FALSE}.}

\item{eval.abbrevs}{Logical. Should abbreviations for the search words be
automatically detected and then replaced with the search word + "$*"?
Default: \code{TRUE}.}

\item{out.table.format}{String. Output file format. Either comma separated
file \code{.csv} or tab separated file \code{.tsv}. The encoding indicated
in parentheses should be selected according to the operating system the
exported tables are opened in, i.e., Windows: \code{(WINDOWS-1252)}; Mac:
\code{(macintosh)}; Linux: \code{(UTF-8)}. Default: \code{".csv"} and
encoding depending on the operating system.}

\item{dev}{Numeric. For a table, the size of indentation which would be
considered the same column. Default: \code{20}.}

\item{write.table.locations}{Logical. If \code{TRUE}, a separate file with the
headings of all tables, their relative location in the generated html and
txt files, as well as information if search words were found will be
generated. Default: \code{FALSE}.}

\item{exp.nondetc.tabs}{Logical. If \code{TRUE} and a table was detected in a
PDF file but is an image or cannot be read, the page with the table will be
exported as a png. Default: \code{FALSE}.}

\item{write.tab.doc.file}{Logical. If \code{TRUE}, search words are used
for table detection, and no search words were found in the tables of a PDF
file, a \strong{no.table.w.search.words} documentation file will be
generated. Default: \code{FALSE}.}

\item{delete}{Logical. If \code{TRUE}, the intermediate \strong{txt},
\strong{keeplayouttxt} and \strong{html} copies of the PDF file will be 
deleted. Default: \code{TRUE}.}

\item{verbose}{Logical. Indicates whether messages will be printed in the console. Default: \code{TRUE}.}
}
\value{
If tables were extracted from the PDF file the function returns a list of
 the following tables/items: 1) \strong{htmltablelines}, 2)
 \strong{txttablelines}, 3) \strong{keeplayouttxttablelines}, 4) \strong{id},
 5) \strong{out_msg}.
 The \strong{tablelines} are tables that provide the heading and position of
 the detected tables. The \strong{id} provides the name of the PDF file. The
 \strong{out_msg} includes all messages printed to the console or the suppressed
 messages if \code{verbose=FALSE}.
}
\description{
\code{PDE_pdfs2table_searchandfilter} extracts tables from one or more PDF
files according to filter and search words and writes output in the
corresponding folder.
}
\examples{

## Running a simple analysis with filter and search words to extract tables
if(PDE_check_Xpdf_install() == TRUE){
 outputtables <- PDE_pdfs2table_searchandfilter(pdf = paste0(PDE_path(),
                                   "examples/Methotrexate/29973177_!.pdf"),
 out = paste0(PDE_path(),"examples/29973177_tables/"),
 filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
 ignore.case.fw = TRUE,
 search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
 ignore.case.sw = FALSE)
}

## Running an advanced analysis with filter and search words to
## extract tables and obtain documentation files
if(PDE_check_Xpdf_install() == TRUE){
 outputtables <- PDE_pdfs2table_searchandfilter(pdf = paste0(PDE_path(),
                                   "examples/Methotrexate/29973177_!.pdf"),
 out = paste0(PDE_path(),"examples/29973177_tables/"),
 dev = 20,
 filter.words = strsplit("cohort;case-control;group;study population;study participants", ";")[[1]],
 ignore.case.fw = TRUE,
 filter.word.times = 20,
 table.heading.words = "",
 ignore.case.th = FALSE,
 search.words = strsplit("(M|m)ethotrexate;(T|t)rexal;(R|r)heumatrex;(O|o)trexup", ";")[[1]],
 ignore.case.sw = FALSE,
 eval.abbrevs = TRUE,
 out.table.format = ".csv (WINDOWS-1252)",
 write.table.locations = TRUE,
 write.tab.doc.file = TRUE,
 exp.nondetc.tabs = TRUE,
 delete = TRUE)
}

}
\seealso{
\code{\link{PDE_extr_data_from_pdfs}}, \code{\link{PDE_pdfs2table}}
}
