Download samples from ftp://ftp.ncbi.nlm.nih.gov/pub/pmc
Store the PMC files at ~/datasets/PMC
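A minimal sketch for fetching one slice of the Open Access packages with wget, assuming the oa_package layout on the FTP server (the 08/e0 subdirectory is only illustrative; pick any subdirectory from the FTP listing):
$wget -r -np -nH --cut-dirs=2 -P ~/datasets/PMC ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/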
Extract the archives
$for a in ~/datasets/PMC/*/*/*/*.tar.gz; do tar -xzvf "$a"; done
Each archive unpacks into a directory named after its PMC ID; run the loop from ~/datasets/PMC so the articles end up there (e.g. ~/datasets/PMC/PMC162170)
Pretty-print the XML files with consistent indentation
$for a in ~/datasets/PMC/*/*.nxml; do xmllint --format "$a" --output "$a.xml"; done
Test an XPath query
$xmllint --xpath '//contrib' ~/datasets/PMC/PMC162170/bpo_v5_p136_m55.nxml.xml
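The same file can be used to sanity-check the expressions that the Logstash filter below relies on, e.g. the article title:
$xmllint --xpath '/article/front/article-meta/title-group/article-title/text()' ~/datasets/PMC/PMC162170/bpo_v5_p136_m55.nxml.xml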
Configure Filebeat
###################### Filebeat Configuration Example #########################
# This file is an example configuration file highlighting only the most common
# options. The filebeat.reference.yml file from the same directory contains all the
# supported options with more comments. You can use it as a reference.
#
# You can find the full configuration reference here:
# https://www.elastic.co/guide/en/beats/filebeat/index.html
# For more available modules and options, please see the filebeat.reference.yml sample
# configuration file.
#=========================== Filebeat prospectors =============================
filebeat.prospectors:

# Each - is a prospector. Most options can be set at the prospector level, so
# you can use different prospectors for various configurations.
# Below are the prospector specific configurations.

- type: log

  # Change to true to enable this prospector configuration.
  enabled: true

  # Paths that should be crawled and fetched. Glob based paths.
  paths:
    - /home/duy/datasets/PMC/*/*.xml
    #- c:\programdata\elasticsearch\logs\*
  # Exclude lines. A list of regular expressions to match. It drops the lines that
  # are matching any regular expression from the list.
  #exclude_lines: ['^DBG']

  # Include lines. A list of regular expressions to match. It exports the lines that
  # are matching any regular expression from the list.
  #include_lines: ['^ERR', '^WARN']

  # Exclude files. A list of regular expressions to match. Filebeat drops the files
  # that are matching any regular expression from the list. By default, no files are
  # dropped.
  #exclude_files: ['.gz$']

  # Optional additional fields. These fields can be freely picked
  # to add additional information to the crawled log files for filtering
  #fields:
  #  level: debug
  #  review: 1
  ### Multiline options

  # Multiline can be used for log messages spanning multiple lines. This is common
  # for Java stack traces or C-line continuation. Here it joins every line of an
  # XML file into a single event, starting at each <article> root element.

  # The regexp pattern that has to be matched. The example pattern matches all lines
  # starting with '<article'.
  multiline.pattern: '^<article'

  # Defines if the pattern set under pattern should be negated or not. Default is false.
  multiline.negate: true

  # Match can be set to "after" or "before". It is used to define if lines should be
  # appended to a pattern that was (not) matched before or after, or as long as a
  # pattern is not matched, based on negate.
  # Note: "after" is the equivalent of "previous" and "before" is the equivalent of
  # "next" in Logstash.
  multiline.match: after
#============================= Filebeat modules ===============================
filebeat.config.modules:
  # Glob pattern for configuration loading
  path: ${path.config}/modules.d/*.yml

  # Set to true to enable config reloading
  reload.enabled: false

  # Period on which files under path should be checked for changes
  #reload.period: 10s
#==================== Elasticsearch template setting ==========================
setup.template.settings:
  index.number_of_shards: 3
  #index.codec: best_compression
  #_source.enabled: false
#================================ General =====================================
# The name of the shipper that publishes the network data. It can be used to group
# all the transactions sent by a single shipper in the web interface.
#name:
# The tags of the shipper are included in their own field with each
# transaction published.
#tags: ["service-X", "web-tier"]
# Optional fields that you can specify to add additional information to the
# output.
#fields:
# env: staging
#============================== Dashboards =====================================
# These settings control loading the sample dashboards to the Kibana index. Loading
# the dashboards is disabled by default and can be enabled either by setting the
# options here, or by using the `-setup` CLI flag or the `setup` command.
#setup.dashboards.enabled: false
# The URL from where to download the dashboards archive. By default this URL
# has a value which is computed based on the Beat name and version. For released
# versions, this URL points to the dashboard archive on the artifacts.elastic.co
# website.
#setup.dashboards.url:
#============================== Kibana =====================================
# Starting with Beats version 6.0.0, the dashboards are loaded via the Kibana API.
# This requires a Kibana endpoint configuration.
setup.kibana:

  # Kibana Host
  # Scheme and port can be left out and will be set to the default (http and 5601)
  # In case you specify an additional path, the scheme is required: http://localhost:5601/path
  # IPv6 addresses should always be defined as: https://[2001:db8::1]:5601
  #host: "localhost:5601"
#============================= Elastic Cloud ==================================
# These settings simplify using filebeat with the Elastic Cloud (https://cloud.elastic.co/).
# The cloud.id setting overwrites the `output.elasticsearch.hosts` and
# `setup.kibana.host` options.
# You can find the `cloud.id` in the Elastic Cloud web UI.
#cloud.id:
# The cloud.auth setting overwrites the `output.elasticsearch.username` and
# `output.elasticsearch.password` settings. The format is `<user>:<pass>`.
#cloud.auth:
#================================ Outputs =====================================
# Configure what output to use when sending the data collected by the beat.
#-------------------------- Elasticsearch output ------------------------------
#output.elasticsearch:
  # Array of hosts to connect to.
  #hosts: ["localhost:9200"]

  # Optional protocol and basic auth credentials.
  #protocol: "https"
  #username: "elastic"
  #password: "changeme"
#----------------------------- Logstash output --------------------------------
output.logstash:
  # The Logstash hosts
  hosts: ["localhost:5044"]

  # Optional SSL. By default is off.
  # List of root certificates for HTTPS server verifications
  #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"]

  # Certificate for SSL client authentication
  #ssl.certificate: "/etc/pki/client/cert.pem"

  # Client Certificate Key
  #ssl.key: "/etc/pki/client/cert.key"
#================================ Logging =====================================
# Sets log level. The default log level is info.
# Available log levels are: error, warning, info, debug
#logging.level: debug
# At debug level, you can selectively enable logging only for some components.
# To enable all selectors use ["*"]. Examples of other selectors are "beat",
# "publish", "service".
#logging.selectors: ["*"]
#============================== Xpack Monitoring ===============================
# filebeat can export internal metrics to a central Elasticsearch monitoring
# cluster. This requires xpack monitoring to be enabled in Elasticsearch. The
# reporting is disabled by default.
# Set to true to enable the monitoring reporter.
#xpack.monitoring.enabled: false
# Uncomment to send the metrics to Elasticsearch. Most settings from the
# Elasticsearch output are accepted here as well. Any setting that is not set is
# automatically inherited from the Elasticsearch output configuration, so if you
# have the Elasticsearch output configured, you can simply uncomment the
# following line.
#xpack.monitoring.elasticsearch:
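Before wiring it into the pipeline, you can validate the Filebeat configuration (assuming it is saved as /etc/filebeat/filebeat-pmc.yml, the path used when starting Filebeat below):
$sudo /usr/share/filebeat/bin/filebeat test config -c /etc/filebeat/filebeat-pmc.yml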
Configure Logstash
input {
  beats {
    port => "5044"
  }
}
# The filter part parses each XML document and extracts metadata fields via XPath.
filter {
  xml {
    source => "message"
    store_xml => false
    xpath => [
      "/article/front/journal-meta/journal-title/text()", "[journal][title]",
      "/article/front/journal-meta/publisher/publisher-name/text()", "publisher",
      "/article/front/article-meta/article-id/@pub-id-type", "[identifier][type]",
      "/article/front/article-meta/article-id/text()", "[identifier][value]",
      "/article/front/article-meta/title-group/article-title/text()", "[article][title]",
      "/article/front/article-meta/contrib-group/contrib/name/surname/text()", "[author][lastname]",
      "/article/front/article-meta/contrib-group/contrib/name/given-names/text()", "[author][firstname]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/year/text()", "[journal][year_pub]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/month/text()", "[journal][month_pub]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/day/text()", "[journal][day_pub]",
      "/article/front/article-meta/abstract/sec//text()|/article/front/article-meta/abstract//text()", "[article][abstract]"
    ]
  }
  mutate {
    add_field => {
      "collection" => "pmc"
    }
  }
}
output {
  #stdout { codec => rubydebug }
  elasticsearch {
    hosts => [ "localhost:9200" ]
    index => "publications-en"
    document_id => "%{collection}-%{[identifier][value][0]}"
  }
}
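Logstash can check the pipeline file for syntax errors without starting it (assuming it is saved as /etc/logstash/conf.d/pmc-pipeline.conf, as used below):
$sudo /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/pmc-pipeline.conf --config.test_and_exit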
Start Elasticsearch, Logstash, and Filebeat
$sudo service elasticsearch start
$sudo /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/pmc-pipeline.conf --config.reload.automatic
$sudo /usr/share/filebeat/bin/filebeat -e -c /etc/filebeat/filebeat-pmc.yml -d "publish"
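Once Filebeat starts shipping, confirm that documents are arriving in the index:
$curl 'http://localhost:9200/_cat/indices/publications-en?v'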
Retrieve documents containing the term 'diaminofluorescein'
$curl 'http://localhost:9200/_search?q=diaminofluorescein&pretty=true&size=50'
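For more control than the URI search, the same query can target a single field through the query DSL; the field name here follows the [article][abstract] mapping from the Logstash filter, which Elasticsearch stores as article.abstract:
$curl -H 'Content-Type: application/json' 'http://localhost:9200/publications-en/_search?pretty' -d '{"size": 50, "query": {"match": {"article.abstract": "diaminofluorescein"}}}'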