Skip to content
JohnDaws edited this page May 14, 2018 · 2 revisions

This example demonstrates some of the structure and templating functionality introduced in Baleen 2.4. It consists of three pipelines: one performing structure annotation, one performing record definition, and one performing record annotation. These can all be run in parallel.

Several files are included as part of this example to demonstrate the functionality. Please download these into the correct directories (as shown below) before running the pipelines.

config.yml

# Pipelines declared by this configuration. Each entry pairs a pipeline
# name with the YAML file that holds that pipeline's definition.
pipelines:
  - name: structure-annotation
    file: ./structure-pipeline.yml
  - name: record-definition
    file: ./record-definition-pipeline.yml
  - name: record-annotation
    file: ./record-annotation-pipeline.yml

structure-pipeline.yml

# Structure-annotation pipeline (referenced from config.yml as
# ./structure-pipeline.yml).

# Input: a FolderReader configured with a single folder, ./input.
collectionreader:
  class: FolderReader
  folders:
    - ./input

# Annotators: language.OpenNLP first, then the regex.* entries, then a
# mix of cleaners.* and grammatical.* entries.
# NOTE(review): list order presumably matters at runtime, so it is kept
# exactly as in the original -- confirm before reordering.
annotators:
  - language.OpenNLP
  - regex.Area
  - regex.BritishArmyUnits
  - regex.Callsign
  - regex.Date
  - regex.DateTime
  - regex.Dtg
  - regex.Time
  - regex.TimeQuantity
  - regex.Distance
  - regex.Email
  - regex.FlightNumber
  - regex.Frequency
  - regex.IpV4
  - regex.LatLon
  - regex.Mgrs
  - regex.Money
  - regex.Nationality
  - regex.Osgb
  - regex.Postcode
  - regex.TaskForce
  - regex.Telephone
  - regex.Url
  - regex.Volume
  - regex.Weight
  - cleaners.MergeAdjacentQuantities
  - grammatical.NPTitleEntity
  - grammatical.QuantityNPEntity
  - grammatical.TOLocationEntity
  - cleaners.RemoveLowConfidenceEntities
  - cleaners.RemoveNestedEntities
  - cleaners.RemoveNestedLocations
  - cleaners.NormalizeWhitespace
  - cleaners.CleanPunctuation
  - cleaners.AddTitleToPerson

# Consumers: two HTML outputs, each with its own output folder and its
# own CSS path.
consumers:
  - class: StructuralHtml
    outputFolder: ./output_structure
    css: ../structural.css
  - class: Html5
    outputFolder: ./output_entities
    css: ../html.css

record-annotation-pipeline.yml

# Record-annotation pipeline (referenced from config.yml as
# ./record-annotation-pipeline.yml).

# Input: a FolderReader configured with a single folder,
# ./recordAnnotationInput.
collectionreader:
  class: FolderReader
  folders:
    - ./recordAnnotationInput

# Annotators: language.OpenNLP, the regex.* entries, and a mix of
# cleaners.* and grammatical.* entries, followed by the template
# annotator as the final entry.
# NOTE(review): list order presumably matters at runtime, so it is kept
# exactly as in the original -- confirm before reordering.
annotators:
  - language.OpenNLP
  - regex.Area
  - regex.BritishArmyUnits
  - regex.Callsign
  - regex.Date
  - regex.DateTime
  - regex.Dtg
  - regex.Time
  - regex.TimeQuantity
  - regex.Distance
  - regex.Email
  - regex.FlightNumber
  - regex.Frequency
  - regex.IpV4
  - regex.LatLon
  - regex.Mgrs
  - regex.Money
  - regex.Nationality
  - regex.Osgb
  - regex.Postcode
  - regex.TaskForce
  - regex.Telephone
  - regex.Url
  - regex.Volume
  - regex.Weight
  - cleaners.MergeAdjacentQuantities
  - grammatical.NPTitleEntity
  - grammatical.QuantityNPEntity
  - grammatical.TOLocationEntity
  - cleaners.RemoveLowConfidenceEntities
  - cleaners.RemoveNestedEntities
  - cleaners.RemoveNestedLocations
  - cleaners.NormalizeWhitespace
  - cleaners.CleanPunctuation
  - cleaners.AddTitleToPerson
  # Points at ./recordDefinitions, the same path the record-definition
  # pipeline uses as its outputDirectory.
  - class: templates.TemplateAnnotator
    recordDefinitionsDirectory: ./recordDefinitions

# Consumers: a file-based record consumer configured with ./recordYaml
# as its output directory, plus the two print.* consumers.
consumers:
  - class: template.FileTemplateRecordConsumer
    outputDirectory: ./recordYaml
    recordDefinitionsDirectory: ./recordDefinitions
  - print.Structures
  - print.Selectors

record-definition-pipeline.yml

# Record-definition pipeline (referenced from config.yml as
# ./record-definition-pipeline.yml).

# Input: a FolderReader configured with a single folder,
# ./recordDefinitionInput.
collectionreader:
  class: FolderReader
  folders:
    - ./recordDefinitionInput

# Annotators: language.OpenNLP followed by the two template definition
# annotators (field definitions, then record definitions).
annotators:
  - language.OpenNLP
  - templates.TemplateFieldDefinitionAnnotator
  - templates.TemplateRecordDefinitionAnnotator

# Consumers: the two print.* consumers, then the consumer configured
# with ./recordDefinitions as its output directory. That path is the
# one the record-annotation pipeline uses as recordDefinitionsDirectory.
consumers:
  - print.Structures
  - print.Selectors
  - class: template.TemplateRecordConfigurationCreatingConsumer
    outputDirectory: ./recordDefinitions
    # Type names passed to the consumer; values and order are unchanged.
    # NOTE(review): presumably the structural types considered when
    # creating record configurations -- confirm against the consumer.
    types:
      - Anchor
      - Aside
      - Caption
      - DefinitionDescription
      - DefinitionItem
      - DefinitionList
      - Details
      - Document
      - SlideShow
      - SpreadSheet
      - TextDocument
      - Figure
      - Footer
      - Header
      - Heading
      - Link
      - ListItem
      - Ordered
      - Page
      - Sheet
      - Slide
      - Paragraph
      - Preformatted
      - Quotation
      - Section
      - Sentence
      - Style
      - Summary
      - Table
      - TableBody
      - TableCell
      - TableFooter
      - TableHeader
      - TableRow
      - Unordered