Download samples from ftp://ftp.ncbi.nlm.nih.gov/pub/pmc
Store the PMC files at ~/datasets/PMC
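A minimal sketch for fetching one slice of the Open Access packages with wget, assuming the oa_package layout on the FTP server (the 08/e0 subdirectory is only illustrative; pick any subdirectory from the FTP listing):
$wget -r -np -nH --cut-dirs=2 -P ~/datasets/PMC ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/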
Extract the archives
$for a in ~/datasets/PMC/*/*/*/*.tar.gz; do tar -xzvf "$a"; done
Each archive unpacks into a directory named after its PMC ID; run the loop from ~/datasets/PMC so the articles end up there (e.g. ~/datasets/PMC/PMC162170)
Pretty-print the XML files with consistent indentation
$for a in ~/datasets/PMC/*/*.nxml; do xmllint --format "$a" --output "$a.xml"; done
Test an XPath query
$xmllint --xpath '//contrib' ~/datasets/PMC/PMC162170/bpo_v5_p136_m55.nxml.xml
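The same file can be used to sanity-check the expressions that the Logstash filter below relies on, e.g. the article title:
$xmllint --xpath '/article/front/article-meta/title-group/article-title/text()' ~/datasets/PMC/PMC162170/bpo_v5_p136_m55.nxml.xml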
Configure Filebeat
###################### Filebeat Configuration Example #########################
# This file is an example configuration file highlighting only the most common
# options. The filebeat.reference.yml file from the same directory contains all the
# supported options with more comments. You can use it as a reference.
#
# You can find the full configuration reference here:
# https://www.elastic.co/guide/en/beats/filebeat/index.html
# For more available modules and options, please see the filebeat.reference.yml sample
# configuration file.
#=========================== Filebeat prospectors =============================
filebeat.prospectors:

# Each - is a prospector. Most options can be set at the prospector level, so
# you can use different prospectors for various configurations.
# Below are the prospector specific configurations.

- type: log

  # Change to true to enable this prospector configuration.
  enabled: true

  # Paths that should be crawled and fetched. Glob based paths.
  paths:
    - /home/duy/datasets/PMC/*/*.xml
    #- c:\programdata\elasticsearch\logs\*
  # Exclude lines. A list of regular expressions to match. It drops the lines that
  # are matching any regular expression from the list.
  #exclude_lines: ['^DBG']

  # Include lines. A list of regular expressions to match. It exports the lines that
  # are matching any regular expression from the list.
  #include_lines: ['^ERR', '^WARN']

  # Exclude files. A list of regular expressions to match. Filebeat drops the files
  # that are matching any regular expression from the list. By default, no files are
  # dropped.
  #exclude_files: ['.gz$']

  # Optional additional fields. These fields can be freely picked
  # to add additional information to the crawled log files for filtering
  #fields:
  #  level: debug
  #  review: 1
  ### Multiline options

  # Multiline can be used for log messages spanning multiple lines. This is common
  # for Java stack traces or C-line continuation. Here it joins every line of an
  # XML file into a single event, starting at each <article> root element.

  # The regexp pattern that has to be matched. The example pattern matches all lines
  # starting with '<article'.
  multiline.pattern: '^<article'

  # Defines if the pattern set under pattern should be negated or not. Default is false.
  multiline.negate: true

  # Match can be set to "after" or "before". It is used to define if lines should be
  # appended to a pattern that was (not) matched before or after, or as long as a
  # pattern is not matched, based on negate.
  # Note: "after" is the equivalent of "previous" and "before" is the equivalent of
  # "next" in Logstash.
  multiline.match: after
#============================= Filebeat modules ===============================
filebeat.config.modules:
  # Glob pattern for configuration loading
  path: ${path.config}/modules.d/*.yml

  # Set to true to enable config reloading
  reload.enabled: false

  # Period on which files under path should be checked for changes
  #reload.period: 10s
#==================== Elasticsearch template setting ==========================
setup.template.settings:
  index.number_of_shards: 3
  #index.codec: best_compression
  #_source.enabled: false
#================================ General =====================================
# The name of the shipper that publishes the network data. It can be used to group
# all the transactions sent by a single shipper in the web interface.
#name:
# The tags of the shipper are included in their own field with each
# transaction published.
#tags: ["service-X", "web-tier"]
# Optional fields that you can specify to add additional information to the
# output.
#fields:
# env: staging
#============================== Dashboards =====================================
# These settings control loading the sample dashboards to the Kibana index. Loading
# the dashboards is disabled by default and can be enabled either by setting the
# options here, or by using the `-setup` CLI flag or the `setup` command.
#setup.dashboards.enabled: false
# The URL from where to download the dashboards archive. By default this URL
# has a value which is computed based on the Beat name and version. For released
# versions, this URL points to the dashboard archive on the artifacts.elastic.co
# website.
#setup.dashboards.url:
#============================== Kibana =====================================
# Starting with Beats version 6.0.0, the dashboards are loaded via the Kibana API.
# This requires a Kibana endpoint configuration.
setup.kibana:

  # Kibana Host
  # Scheme and port can be left out and will be set to the default (http and 5601)
  # In case you specify an additional path, the scheme is required: http://localhost:5601/path
  # IPv6 addresses should always be defined as: https://[2001:db8::1]:5601
  #host: "localhost:5601"
#============================= Elastic Cloud ==================================
# These settings simplify using filebeat with the Elastic Cloud (https://cloud.elastic.co/).
# The cloud.id setting overwrites the `output.elasticsearch.hosts` and
# `setup.kibana.host` options.
# You can find the `cloud.id` in the Elastic Cloud web UI.
#cloud.id:
# The cloud.auth setting overwrites the `output.elasticsearch.username` and
# `output.elasticsearch.password` settings. The format is `<user>:<pass>`.
#cloud.auth:
#================================ Outputs =====================================
# Configure what output to use when sending the data collected by the beat.
#-------------------------- Elasticsearch output ------------------------------
#output.elasticsearch:
  # Array of hosts to connect to.
  #hosts: ["localhost:9200"]

  # Optional protocol and basic auth credentials.
  #protocol: "https"
  #username: "elastic"
  #password: "changeme"
#----------------------------- Logstash output --------------------------------
output.logstash:
  # The Logstash hosts
  hosts: ["localhost:5044"]

  # Optional SSL. By default is off.
  # List of root certificates for HTTPS server verifications
  #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"]

  # Certificate for SSL client authentication
  #ssl.certificate: "/etc/pki/client/cert.pem"

  # Client Certificate Key
  #ssl.key: "/etc/pki/client/cert.key"
#================================ Logging =====================================
# Sets log level. The default log level is info.
# Available log levels are: error, warning, info, debug
#logging.level: debug
# At debug level, you can selectively enable logging only for some components.
# To enable all selectors use ["*"]. Examples of other selectors are "beat",
# "publish", "service".
#logging.selectors: ["*"]
#============================== Xpack Monitoring ===============================
# filebeat can export internal metrics to a central Elasticsearch monitoring
# cluster. This requires xpack monitoring to be enabled in Elasticsearch. The
# reporting is disabled by default.
# Set to true to enable the monitoring reporter.
#xpack.monitoring.enabled: false
# Uncomment to send the metrics to Elasticsearch. Most settings from the
# Elasticsearch output are accepted here as well. Any setting that is not set is
# automatically inherited from the Elasticsearch output configuration, so if you
# have the Elasticsearch output configured, you can simply uncomment the
# following line.
#xpack.monitoring.elasticsearch:
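Before wiring it into the pipeline, you can validate the Filebeat configuration (assuming it is saved as /etc/filebeat/filebeat-pmc.yml, the path used when starting Filebeat below):
$sudo /usr/share/filebeat/bin/filebeat test config -c /etc/filebeat/filebeat-pmc.yml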
Configure Logstash
input {
  beats {
    port => "5044"
  }
}
# The filter part parses each XML document and extracts metadata fields via XPath.
filter {
  xml {
    source => "message"
    store_xml => false
    xpath => [
      "/article/front/journal-meta/journal-title/text()", "[journal][title]",
      "/article/front/journal-meta/publisher/publisher-name/text()", "publisher",
      "/article/front/article-meta/article-id/@pub-id-type", "[identifier][type]",
      "/article/front/article-meta/article-id/text()", "[identifier][value]",
      "/article/front/article-meta/title-group/article-title/text()", "[article][title]",
      "/article/front/article-meta/contrib-group/contrib/name/surname/text()", "[author][lastname]",
      "/article/front/article-meta/contrib-group/contrib/name/given-names/text()", "[author][firstname]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/year/text()", "[journal][year_pub]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/month/text()", "[journal][month_pub]",
      "/article/front/article-meta/pub-date[@pub-type='epub']/day/text()", "[journal][day_pub]",
      "/article/front/article-meta/abstract/sec//text()|/article/front/article-meta/abstract//text()", "[article][abstract]"
    ]
  }
  mutate {
    add_field => {
      "collection" => "pmc"
    }
  }
}
output {
  #stdout { codec => rubydebug }
  elasticsearch {
    hosts => [ "localhost:9200" ]
    index => "publications-en"
    document_id => "%{collection}-%{[identifier][value][0]}"
  }
}
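Logstash can check the pipeline file for syntax errors without starting it (assuming it is saved as /etc/logstash/conf.d/pmc-pipeline.conf, as used below):
$sudo /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/pmc-pipeline.conf --config.test_and_exit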
Start Elasticsearch, Logstash, and Filebeat
$sudo service elasticsearch start
$sudo /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/pmc-pipeline.conf --config.reload.automatic
$sudo /usr/share/filebeat/bin/filebeat -e -c /etc/filebeat/filebeat-pmc.yml -d "publish"
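Once Filebeat starts shipping, confirm that documents are arriving in the index:
$curl 'http://localhost:9200/_cat/indices/publications-en?v'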
Retrieve documents containing the term 'diaminofluorescein'
$curl 'http://localhost:9200/_search?q=diaminofluorescein&pretty=true&size=50'
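For more control than the URI search, the same query can target a single field through the query DSL; the field name here follows the [article][abstract] mapping from the Logstash filter, which Elasticsearch stores as article.abstract:
$curl -H 'Content-Type: application/json' 'http://localhost:9200/publications-en/_search?pretty' -d '{"size": 50, "query": {"match": {"article.abstract": "diaminofluorescein"}}}'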