Load data from MySQL TO Elasticsearch - elasticsearch

Hi I am Using the following scirp file in lostash 2.X version I have over 186000 records in MySQL database table,but while running this .conf file only one document is loading in elastic search index.
input {
jdbc {
jdbc_connection_string => "jdbc:mysql://localhost/elasticsearch"
jdbc_user => "root"
jdbc_password => "empower"
#jdbc_validate_connection => true
jdbc_driver_library => "/home/wtc082/Documents/com.mysql.jdbc_5.1.5.jar"
jdbc_driver_class => "com.mysql.jdbc.Driver"
statement => "SELECT * FROM index_part_content_local;"
#schedule => "* * * * *"
#codec => "json"
}
}
output {
elasticsearch {
index => "mysqltest"
document_type => "mysqltest_type"
document_id => "%{id}"
hosts => "localhost:9200"
}
}
When i use this query only one document is index
GET mysqltest/_search
{
"query": {
"match_all": {}
}
}
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "mysqltest",
"_type": "mysqltest_type",
"_id": "%{id}",
"_score": 1,
"_source": {
"partnum": "",
"property1": "",
"property2": "",
"color": "",
"size": "",
"dim": "",
"thumburl": "",
"catid": "6575",
"subcatid": "3813",
"termid": "31999",
"longdesc": "<ul><li>Equipment and Parts<li>GC32-XD Parts<li>D/V Lock Plate Screw</ul>",
"hier1desc": "Heavy Duty Tools / Equipment",
"hier2desc": "Other Heavy Duty Equipment",
"hier3desc": "Hose Crimping Equipment & Accessories",
"aaiabrandid": "BBSC",
"aaiabrandname": "Gates",
"brandimageurl": "es-logo-sm.jpg",
"linecode": "GAT",
"descrp": "D/V Lock Plate Screw",
"#version": "1",
"#timestamp": "2016-12-20T09:16:40.075Z"
}
}
]
}
}

Ok, as you can see the ID of your document is the verbatim value "%{id}", which means that apparently you don't have any id column in your database and all records from your database are indexed under the same document id, hence why you only see one document.
In your elasticsearch output, you need to make sure to use a field that is the primary key of your table
document_id => "%{PRIMARY_KEY}"
Fix that and that will work.

Related

Logstash 2.3.4 How to load nested document in elasticsearch using logstash-jdbc plugin

I am currently using elasticsearch 2.3.4 and logstash 2.3.4 to load relational data from Oracle db into my elasticsearch index using logstash-jdbc plugin. As suggested in various posts, I am using aggregate filter for this. Still I am not able to load the inner nested object in the document. The values are not getting mapped to fields and are displayed as NULL.
I have two related entities with following data:
CREATE TABLE DEPARTMENT (
id NUMBER PRIMARY KEY,
name VARCHAR2(4000) NOT NULL
)
CREATE TABLE EMPLOYEE (
id NUMBER PRIMARY KEY,
name VARCHAR2(4000) NOT NULL,
departmentid NUMBER,
CONSTRAINT EMPLOYEE_FK FOREIGN KEY (departmentid) REFERENCES DEPARTMENT(id)
)
insert into DEPARTMENT values (1, 'dept1');
insert into DEPARTMENT values (2, 'dept2');
insert into DEPARTMENT values (3, 'dept3');
insert into DEPARTMENT values (4, 'dept4');
insert into EMPLOYEE values (1, 'emp1', 1);
insert into EMPLOYEE values (2, 'emp2', 1);
insert into EMPLOYEE values (3, 'emp3', 1);
insert into EMPLOYEE values (4, 'emp4', 2);
insert into EMPLOYEE values (5, 'emp5', 2);
insert into EMPLOYEE values (6, 'emp6', 3);`
Here is my mapping:
{
"mappings": {
"departments": {
"properties": {
"id": {
"type": "integer"
},
"deptName": {
"type": "string"
},
"employee_details": {
"type": "nested",
"properties": {
"empId": {
"type": "integer"
},
"empName": {
"type": "string"
}
}
}
}
}
}
}
And this is my logstash configuration:
input{
jdbc{
jdbc_validate_connection => true
jdbc_connection_string => "jdbc:oracle:thin:#host:port:db"
jdbc_user => "user"
jdbc_password => "pwd"
jdbc_driver_library => "../vendor/jar/ojdbc14.jar"
jdbc_driver_class => "Java::oracle.jdbc.driver.OracleDriver"
statement => "SELECT
department.id AS id,
department.name AS deptName,
employee.id AS empId,
employee.name AS empName
FROM department LEFT JOIN employee
ON department.id = employee.departmentid
ORDER BY id"
}
}
filter{
aggregate {
task_id => "%{id}"
code => "
map['id'] = event['id']
map['deptName'] = event['deptName'] #solution - deptName should be in smaller case and other fields too.
map['employee_details'] ||= []
map['employee_details'] << {'empId' => event['empId], 'empName' => event['empName'] }
"
push_previous_map_as_event => true
timeout => 5
timeout_tags => ['aggregated']
}
}
output{
stdout{ codec => rubydebug }
elasticsearch{
action => "index"
index => "my_index"
document_type => "departments"
document_id => "%{id}"
hosts => "localhost:9200"
}
}
When i perform a XGET on all documents:
curl -XGET 'localhost:9200/my_index/_search/?pretty=true&q=:
The values are not mapped to fields and displayed as NULL:
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "my_index",
"_type": "departments",
"_id": "2",
"_score": 1,
"_source": {
"id": 2,
"deptName": null,
"employee_details": [
{
"empId": null,
"empName": null
},
{
"empId": null,
"empName": null
}
],
"#version": "1",
"#timestamp": "2019-05-14T10:47:33.477Z",
"tags": [
"aggregated"
]
}
},
{
"_index": "my_index",
"_type": "departments",
"_id": "4",
"_score": 1,
"_source": {
"id": 4,
"deptname": "dept4",
"empid": null,
"empname": null,
"#version": "1",
"#timestamp": "2019-05-14T10:47:33.367Z",
"deptName": null,
"employee_details": [
{
"empId": null,
"empName": null
}
]
}
},
{
"_index": "my_index",
"_type": "departments",
"_id": "1",
"_score": 1,
"_source": {
"id": 1,
"deptName": null,
"employee_details": [
{
"empId": null,
"empName": null
},
{
"empId": null,
"empName": null
},
{
"empId": null,
"empName": null
}
],
"#version": "1",
"#timestamp": "2019-05-14T10:47:33.477Z",
"tags": [
"aggregated"
]
}
},
{
"_index": "my_index",
"_type": "departments",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"deptName": null,
"employee_details": [
{
"empId": null,
"empName": null
}
],
"#version": "1",
"#timestamp": "2019-05-14T10:47:33.492Z",
"tags": [
"aggregated"
]
}
}
]
}
}
rubydebug suggests the values are set to 'nil'. Could anyone please help me with what I am doing wrong here?
Here is a snippet from stdout for document with id = 1:
{
"id" => 1.0,
"deptname" => "dept1",
"empid" => 1.0,
"empname" => "emp1",
"#version" => "1",
"#timestamp" => "2019-05-14T12:32:14.272Z"
}
{
"id" => 1.0,
"deptname" => "dept1",
"empid" => 2.0,
"empname" => "emp2",
"#version" => "1",
"#timestamp" => "2019-05-14T12:32:15.272Z"
}
{
"id" => 1.0,
"deptname" => "dept1",
"empid" => 3.0,
"empname" => "emp3",
"#version" => "1",
"#timestamp" => "2019-05-14T12:32:15.272Z"
}
{
"id" => 1.0,
"deptName" => nil,
"employee_details" => [
[0] {
"empId" => nil,
"empName" => nil
},
[1] {
"empId" => nil,
"empName" => nil
},
[2] {
"empId" => nil,
"empName" => nil
}
],
"#version" => "1",
"#timestamp" => "2019-05-14T12:32:15.381Z",
"tags" => [
[0] "aggregated"
]
}
Following code works for me .
input {
jdbc{
jdbc_validate_connection => true
jdbc_connection_string => "----/employees"
jdbc_user => "---"
jdbc_password => "--"
jdbc_driver_library => "/home/ilsa/mysql-connector-java-5.1.36-bin.jar"
jdbc_driver_class => "com.mysql.jdbc.Driver"
statement => "SELECT
e.emp_no as employee_number,
birth_date, first_name, last_name, gender, hire_date, t.title AS titlename,
t.from_date AS titlefrom_date, t.to_date AS titleto_date, d.dept_no AS departmentnumber,
ds.dept_name AS departmentname, d.from_date AS departmentfrom_date, d.to_date AS departmentto_date
FROM employees e
LEFT JOIN(titles t, dept_emp d, departments ds)
ON(e.emp_no = t.emp_no AND e.emp_no = d.emp_no AND d.dept_no = ds.dept_no AND t.from_date < d.to_date AND t.to_date > d.from_date)
ORDER BY e.emp_no ASC"
}
}
filter {
aggregate {
task_id => "%{employee_number}"
code => "
map['employee_number'] = event.get('employee_number')
map['birth_date'] = event.get('birth_date')
map['first_name'] = event.get('first_name')
map['last_name'] = event.get('last_name')
map['gender'] = event.get('gender')
map['hire_date'] = event.get('hire_date')
map['roles'] ||= []
map['roles'] << {
'title.name' => event.get('titlename'),
'title.from_date' => event.get('titlefrom_date'),
'title.to_date' => event.get('titleto_date'),
'department.number' => event.get('departmentnumber'),
'department.name' => event.get('departmentname'),
'department.from_date' => event.get('departmentfrom_date'),
'department.to_date' => event.get('departmentto_date')
}
event.cancel()"
push_previous_map_as_event => true
timeout => 30
}
}
output {
stdout{ codec => rubydebug }
elasticsearch{
action => "index"
index => "employees"
document_type => "employee"
document_id => "%{employee_number}"
hosts => "localhost:9200"
}
}
You can also try to make use of jdbc streaming in logstash filter plugin.
Check this post
Inserting Nested Objects using Logstash
For example, I am taking Stackoverflow Posts and Users as an example. Here Post is parent table and it is associated with Users table on OwnerUserId. So my plugin configuration is
input {
jdbc {
jdbc_driver_library => "/usr/share/logstash/javalib/mssql-jdbc-8.2.2.jre11.jar"
jdbc_driver_class => "com.microsoft.sqlserver.jdbc.SQLServerDriver"
jdbc_connection_string => "jdbc:sqlserver://host.docker.internal;database=StackOverflow2010;user=pavan;password=pavankumar#123"
jdbc_user => "pavan"
jdbc_password => "pavankumar#123"
statement => "select top 500 * from StackOverflow2010.dbo.Posts p "
}
}
filter{
jdbc_streaming {
jdbc_driver_library => "/usr/share/logstash/javalib/mssql-jdbc-8.2.2.jre11.jar"
jdbc_driver_class => "com.microsoft.sqlserver.jdbc.SQLServerDriver"
jdbc_connection_string => "jdbc:sqlserver://host.docker.internal;database=StackOverflow2010;user=pavan;password=pavankumar#123"
jdbc_user => "pavan"
jdbc_password => "pavankumar#123"
statement => "select * from StackOverflow2010.dbo.Users u where u.Id = :owneruserid"
parameters => {"owneruserid" => "owneruserid"}
target => "user_details"
}
}
output {
elasticsearch {
hosts => ["http://elasticsearch:9200", "http://elasticsearch:9200"]
index => "stackoverflow_top_user"
}
stdout {
codec => rubydebug
}
}

Elasticsearch - Range query work wrong

I used kibana DEV Tools to query some range data,but there have 2 hits is out of my expectation,why it happens?
image of the range query
the query:
{
"query" : {
"constant_score" : {
"filter" : {
"range" : {
"rss" : {
"gte": 3000000
}
}
}
}
}
}
the result:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 69,
"successful": 69,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "BBQ",
"_type": "BBQ",
"_id": "AWGJaCYkk-tGbWgj2e6R",
"_score": 1,
"_source": {
"message": [
"nodeProcessInfo"
],
"#timestamp": "2018-02-12T09:45:59.525Z",
"rss": "92636",
"#version": "1",
"host": "192.168.213.96"
}
},
{
"_index": "BBQ",
"_type": "BBQ",
"_id": "AWGJaJxzk-tGbWgj2e-V",
"_score": 1,
"_source": {
"message": [
"nodeProcessInfo"
],
"#timestamp": "2018-02-12T09:46:29.680Z",
"rss": "85272",
"#version": "1",
"host": "192.168.213.96"
}
}
]
}
}
The result of range query is not in my expectation, why gte => 3000000 but rss = 92636 appeared?
======================edit at 2018.2.13=========(1)
the log like this:
"nodeProcessInfo|auth-server-1|auth|9618|1.9|1.2|98060|2018-2-12 6:33:43 PM|"
the filter like this:
filter {
if "nodeProcessInfo" in [message] {
mutate {
split => ["message", "|"]
add_field => {
"serverId" => "%{[message[1]]}"
}
add_field => {
"serverType" => "%{[message[2]]}"
}
add_field => {
"pid" => "%{[message[3]]}"
}
add_field => {
"cpuAvg" => "%{[message[4]]}"
}
add_field => {
"memAvg" => "%{[message[5]]}"
}
add_field => {
"rss" => "%{[message[6]]}"
}
add_field => {
"time" => "%{[message[7]]}"
}
convert => ["rss", "integer"] # I try convert rss to int, but failed
add_tag => "nodeProcessInfo"
}
}
}
======================edit at 2018.2.13=========(2)
I let the convert code in a new mutate, and it worked to make "rss" into int type,but the result of range query also wrong,the change code like this:
if "nodeProcessInfo" in [message] {
mutate {
split => ["message", "|"]
...
...
add_field => {
"rss" => "%{[message[6]]}"
}
}
mutate {
convert => ["rss", "integer"] # add a new mutate here
}
}
======================edit at 2018.2.13=========(3)
At last I found the reason why rss'type is converted to int but range query also wrong:
"You can't change existing mapping type, you need to create a new index with the correct mapping and index the data again."
so I create a new field name to instead of rss and the result of range query is right now.
Can you share the mapping of the index.
I thing the problem is as i can see in the search results which you have shared , the type of the rss field is text or string.
If it is so then the range query you are using is treating them as string characters and giving you results according to that.
And what you are trying to use is number ranges which will work if you index data with type of rss field as long and then fire the same query.
You would then get the desired reuslts

Wrong data shown when searched

i have a dataset of more than a million rows. I have integrated elasticsearch with Mysql using logstash.
When i type the following URL to fetch in postman,
http://localhost:9200/persondetails/Document/_search?q=*
i get the following:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "persondetails",
"_type": "Document",
"_id": "%{idDocument}",
"_score": 1,
"_source": {
"iddocument": 514697,
"#timestamp": "2017-08-31T05:18:46.916Z",
"author": "vaibhav",
"expiry_date": null,
"#version": "1",
"description": "ly that",
"creation_date": null,
"type": 1
}
},
{
"_index": "persondetails",
"_type": "Document_count",
"_id": "AV4o0J3OJ5ftvuhV7i0H",
"_score": 1,
"_source": {
"query": {
"term": {
"author": "rishav"
}
}
}
}
]
}
}
it is wrong as the number of rows in my table is more than 1 million and this shows that total is only 2. I am unable to find what is the mistake here.
when i type http://localhost:9200/_cat/indices?v
It shows this
health:yellow
status:open
index:persondetails
uuid:4FiGngZcQfS0Xvu6IeHIfg
pri:5
rep : 1
docs.count : 2
docs.deleted :1054
store.size : 125.4kb
pri.store.size : 125.4kb
This is my logstash.conf file
input {
jdbc {
jdbc_connection_string => "jdbc:mysql://127.0.0.1:3306/persondetails"
jdbc_user => "root"
jdbc_password => ""
schedule => "* * * * *"
jdbc_validate_connection => true
jdbc_driver_library => "/usr/local/Cellar/logstash/5.5.2/mysql-connector-java-3.1.14/mysql-connector-java-3.1.14-bin.jar"
jdbc_driver_class => "com.mysql.jdbc.Driver"
statement => "SELECT * FROM Document"
type => "persondetails"
}
}
output {
elasticsearch {
#protocol=>http
index =>"persondetails"
document_type => "Document"
document_id => "%{idDocument}"
hosts => ["http://localhost:9200"]
stdout{ codec => rubydebug}
}
}
From your result, it looks like there is an issue with your logstash configuration which is causing your document to be overwritten because the document_id is not getting generated, and effectively there is only one document in your index with document Id as "%{idDocument}"
See the following _source snippet from the result to the search query you provided:
"_source": {
"iddocument": 514697,
"#timestamp": "2017-08-31T05:18:46.916Z",
"author": "vaibhav",
"expiry_date": null,
"#version": "1",
"description": "ly that",
"creation_date": null,
"type": 1
}
Even looking at the small size of the index, it doesn't look like there are more documents. You should look at whether your jdbc input is providing the "idDocument" field.

using elasticsearch filter in logstash pipeline

I'm using the elasticsearch filter in my logstash pipeline. I correctly find the result using :
filter{
if [class] == "DPAPIINTERNAL" {
elasticsearch {
hosts => "10.1.10.16"
index => "dp_audit-2017.02.16"
query_template => "/home/vittorio/Documents/elastic-queries/matching-requestaw.json"
}
}
}
as you can see, Im using "query_template" which is :
{
"query": {
"query_string": {
"query": "class:DPAPI AND request.aw:%{[aw]}"
}
},
"_source": ["end_point", "vittorio"]
}
that tells elastichsearch to look up the log with that specific class that match "aw" with the DPAPIINTERNAL log.
Perfect! but now that i found the result, i want to add some field from it and attach them to my DPAPIINTERNAL log, for instance, i want to take "end_point" and add it in the new key "vittorio" inside my log.
This is not happening and I don't understand why.
here is the log that i'm looking at using the query:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "dp_audit-2017.02.16",
"_type": "logs",
"_id": "AVpHoPHPuEPlW12Qu",
"_score": 1,
"_source": {
"svc": "dp-1.1",
"request": {
"method": "POST|PATCH|DELETE",
"aw": "prova",
"end_point": "/bank/6311",
"app_instance": "7D1-D233-87E1-913"
},
"path": "/home/vittorio/Documents/dpapi1.json",
"#timestamp": "2017-02-16T15:53:33.214Z",
"#version": "1",
"host": "Vito",
"event": "bank.add",
"class": "DPAPI",
"ts": "2017-01-16T19:20:30.125+01:00"
}
}
]
}
}
Your need to specify the fields parameter in your elasticsearch filter, like this:
elasticsearch {
hosts => "10.1.10.16"
index => "dp_audit-2017.02.16"
query_template => "/home/vittorio/Documents/elastic-queries/matching-requestaw.json"
fields => { "[request][end_point]" => "vittorio" }
}
Note that since end_point is a nested field, you need to modify the _source in your query template like this:
"_source": ["request.end_point"]
the problem is simply that you don't have to specify the "new" field using the query_template.
"_source": ["request"] # here you specify the field you want from the query result.
and then
filter{
if [class] == "DPAPIINTERNAL" {
elasticsearch {
hosts => "10.1.10.16"
index => "dp_audit-2017.02.16"
query_template => "/home/vittorio/Documents/elastic-queries/matching-requestaw.json"
fields => {"request" => "new_key"} # here you add the fields and will tell elastich filter to put request inside new_key
}
}
}
That worked for me!

Script fails in upsert, with logstash output elasticsearch plugin

Environment
DB: Sybase
Logstash: 2.2.0 with JDBC Plugin, Elasticsearch Output plugin
SQL Query:
select res.id as 'res.id', res.name as 'res.name', tag.name as 'tag.name'
from Res res, ResTags rt, Tags tag
where res.id *= rt.resrow and rt.tagid *= tag.id
SQL Result:
res.id | res.name | tag.name
0 | result0 | null
0 | result0 | tagA
1 | result1 | tagA
1 | result1 | tagB
2 | result2 | tagA
2 | result2 | tagC
Index Mapping:
{
"mappings": {
"res": {
"properties": {
"id": { "type": "long"},
"name": { "type": "string" },
"tags": {
"type": "nested",
"properties": { "tagname": { "type": "string" }}
}
}
}
}
Conf File:
input {
jdbc {
jdbc_driver_library => "jtds-1.3.1.jar"
jdbc_driver_class => "Java::net.sourceforge.jtds.jdbc.Driver"
jdbc_connection_string => "jdbc:jtds:sybase://hostname.com:1234/schema"
jdbc_user => "george"
jdbc_password => "monkey"
jdbc_fetch_size => 100
statement_filepath => "/home/george/sql"
}
}
output {
elasticsearch {
action => "update"
index => "myres"
document_type => "res"
document_id => "%{res.id}"
script_lang => "groovy"
hosts => [ "my.other.host.com:5921" ]
upsert => ' {
"id" : %{res.id},
"name" : "%{res.name}",
"tags" :[{ "tagname": "%{tag.name}" }]
}'
script => '
if (ctx._source.res.tags.containsValue(null)) {
// if null has been added replace it with actual value
cts._source.res.tags = [{"tagname": "%{tag.name}" }];
else {
// if you find the tag, then do nothing
if (ctx._source.res.tags.containsValue("%{tag.name}")) {}
else {
// if the value you try to add is not null
if (%{tag.name} != null)
// add it as a new object into the tag array
ctx._source.res.tags += {"tagname": "%{tag.name}"};
}
}
'
}
}
The GOAL is to add the multiple rows returned from the database into ES, concatenating the tags as new objects (this is simplified example, so add_tag and filters do not do the job, as I have json structure deeper than 2 levels (nested of nested, etc))
The desired outcome after the bulk upload into ES would be:
{
"hits": {
"total": 3,
"max_score": 1,
"hits": [ {
"_index": "myres",
"_type": "res",
"_id": 0,
"_score": 1,
"_source": {
"res": {
"id":0,
"name": "result0",
"tags": [{"tagname": "tagA"}],
"#version": "2",
"#timestamp": "2016-xx-yy..."
}
},{
"_index": "myres",
"_type": "res",
"_id": 1,
"_score": 1,
"_source": {
"res": {
"id":1,
"name": "result1",
"tags": [{"tagname": "tagA"},{"tagname": "tagB"}],
"#version": "2",
"#timestamp": "2016-xx-yy..."
}
}{
"_index": "myres",
"_type": "res",
"_id": 2,
"_score": 1,
"_source": {
"res": {
"id":2,
"name": "result2",
"tags": [{"tagname": "tagA"},{"tagname": "tagC"],
"#version": "2",
"#timestamp": "2016-xx-yy..."
}
}
}
...
ISSUE: if in the conf, output section the script is not commented out, the below error pops out. If the script is not included, then only the initial tags (as expected) are imported, and the second ones are not.
It looks like script is not working within elasticsearch output.
ERROR message:
[400] {"error":"ActionRequestValidationException[Validation Failed:
1: script or doc is missing;
2: script or doc is missing;
3: script or doc is missing;],"status":400]} {:class=> ... bla bla ...}
NOTES
To avoid wasting peoples' time, doc_as_upsert => true also does not work as expected. It just keeps on updating / overwriting and just keeps the latest row of the db.
Also, the river plugin for jdbc to ES does not support nested of nested structure so that does not work eithe

Resources