Related
I am totally new to Elasticsearch, so please forgive me if this is a stupid question. My question might have been answered somewhere else already, but I couldn't find it.
I want to use Elasticsearch as a search engine for the PDF and DOCX files in my network. I used fscrawler to ingest the PDFs into Elasticsearch. Since the documents I want to ingest are in several languages, I wanted to use n-grams for stemming. To do so, I wanted to update my mapping like this:
PUT test/_mappings/_all
{
"mappings": {
"title": {
"properties": {
"title": {
"type": "text",
"fields": {
"de": {
"type": "string",
"analyzer": "german"
},
"en": {
"type": "string",
"analyzer": "english"
},
"general": {
"type": "string",
"analyzer": "trigrams"
}
}
}
}
}
}
}
And now I get this error message:
{ "error": {
"root_cause": [
{
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text,
fields={de={type=string, analyzer=german}, en={type=string,
analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]"
}
],
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text,
fields={de={type=string, analyzer=german}, en={type=string,
analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]"
}, "status": 400 }
Do you have any idea how I can fix this? Or do you have an idea how I can ingest the files with the right mapping without using fscrawler?
These are my settings:
{
"test": {
"settings": {
"index": {
"mapping": {
"total_fields": {
"limit": "2000"
}
},
"number_of_shards": "5",
"provided_name": "test",
"creation_date": "1542031632596",
"analysis": {
"filter": {
"trigrams_filter": {
"type": "ngram",
"min_gram": "3",
"max_gram": "3"
}
},
"analyzer": {
"fscrawler_path": {
"tokenizer": "fscrawler_path"
},
"trigrams": {
"filter": [
"lowercase",
"trigrams_filter"
],
"type": "custom",
"tokenizer": "standard"
}
},
"tokenizer": {
"fscrawler_path": {
"type": "path_hierarchy"
}
}
},
"number_of_replicas": "1",
"uuid": "7L3QE5_xRACECVbTFlFY-Q",
"version": {
"created": "6040399"
}
}
}
}
}
My mapping
{
"test": {
"mappings": {
"_doc": {
"dynamic_templates": [
{
"raw_as_text": {
"path_match": "meta.raw.*",
"mapping": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
"attachment": {
"type": "binary"
},
"attributes": {
"properties": {
"group": {
"type": "keyword"
},
"owner": {
"type": "keyword"
}
}
},
"content": {
"type": "text"
},
"file": {
"properties": {
"checksum": {
"type": "keyword"
},
"content_type": {
"type": "keyword"
},
"created": {
"type": "date",
"format": "dateOptionalTime"
},
"extension": {
"type": "keyword"
},
"filename": {
"type": "keyword",
"store": true
},
"filesize": {
"type": "long"
},
"indexed_chars": {
"type": "long"
},
"indexing_date": {
"type": "date",
"format": "dateOptionalTime"
},
"last_accessed": {
"type": "date",
"format": "dateOptionalTime"
},
"last_modified": {
"type": "date",
"format": "dateOptionalTime"
},
"url": {
"type": "keyword",
"index": false
}
}
},
"meta": {
"properties": {
"altitude": {
"type": "text"
},
"author": {
"type": "text"
},
"comments": {
"type": "text"
},
"contributor": {
"type": "text"
},
"coverage": {
"type": "text"
},
"created": {
"type": "date",
"format": "dateOptionalTime"
},
"creator_tool": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "dateOptionalTime"
},
"description": {
"type": "text"
},
"format": {
"type": "text"
},
"identifier": {
"type": "text"
},
"keywords": {
"type": "text"
},
"language": {
"type": "keyword"
},
"latitude": {
"type": "text"
},
"longitude": {
"type": "text"
},
"metadata_date": {
"type": "date",
"format": "dateOptionalTime"
},
"modifier": {
"type": "text"
},
"print_date": {
"type": "date",
"format": "dateOptionalTime"
},
"publisher": {
"type": "text"
},
"rating": {
"type": "byte"
},
"relation": {
"type": "text"
},
"rights": {
"type": "text"
},
"source": {
"type": "text"
},
"title": {
"type": "text"
},
"type": {
"type": "text"
}
}
},
"path": {
"properties": {
"real": {
"type": "keyword",
"fields": {
"fulltext": {
"type": "text"
},
"tree": {
"type": "text",
"analyzer": "fscrawler_path",
"fielddata": true
}
}
},
"root": {
"type": "keyword"
},
"virtual": {
"type": "keyword",
"fields": {
"fulltext": {
"type": "text"
},
"tree": {
"type": "text",
"analyzer": "fscrawler_path",
"fielddata": true
}
}
}
}
}
}
}
}
}
}
env: ElasticSearch 5.5.1
First, there are two indices in my Elasticsearch cluster:
The only difference between the two indices is the message field: its type is keyword in index1 and text in index2.
To make sure the result is not affected by other fields, I removed the message field and compared the index sizes before and after:
Before removing the message field:
After removing the message field I got:
Obviously the message field takes up a lot of space, and the keyword type takes up much more than the text type, but I don't understand why keyword takes up so much more space than text.
So, can anyone help me?
The following is the mapping of index1:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"#agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"#timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and its settings:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index1",
"creation_date": "1531389785215",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "zd8oVbwUQbys1UJ8hJZRmQ",
"version": {
"created": "5050099"
}
}
}
The following is the mapping of index2:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"#agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"#timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"CommunicationReturnCode": {
"type": "keyword"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"action": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "text"
},
"msgid": {
"type": "keyword"
},
"msgname": {
"type": "keyword"
},
"nodetype": {
"type": "keyword"
},
"orgid": {
"type": "keyword"
},
"orgname": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"processingState": {
"type": "keyword"
},
"processingStatecode": {
"type": "keyword"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"thread": {
"type": "keyword"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"transDescription": {
"type": "keyword"
},
"transactionErrorCode": {
"type": "keyword"
},
"transactionTimeConsuming": {
"type": "keyword"
},
"transcode": {
"type": "keyword"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and its settings:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "2",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index2",
"creation_date": "1531467294314",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "yROU2MrMTzip4VXH_zWEXQ",
"version": {
"created": "5050099"
}
}
}
The following is the file structure of one of the two shards for the index with the text-type field:
and for the index with the keyword-type field:
You can trust that both folders hold the same number of documents, and that the only difference between the indices is the type of the message field.
Could you explain this?
Thank you so much!
In Elasticsearch, keyword fields have doc_values enabled by default, while text fields do not. This means that for your keyword field the whole value is additionally stored in a column-oriented fashion, so that aggregations and sorting can be performed without relying on fielddata.
Also, once you tokenize a string (with stemming, lowercasing, etc.), you can achieve much better compression.
You can try disabling doc_values on that field if you don't perform aggregations or sorting on it.
I have a problem with a query that returns no results. When I execute the following query, with either match or term:
{
"size": 1,
"query": {
"bool": {
"must": [
{ "term": { "ALERT_TYPE.raw": "ERROR" }}
],
"filter": [
{ "range": {
"#timestamp": {
"gte": "2018-02-01T00:00:01.000Z",
"lte": "2018-02-28T23:55:55.000Z"
}
}}
]
}
}
}
I always get the following response:
{
"took": 92,
"timed_out": false,
"_shards": {
"total": 215,
"successful": 215,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
But I'm sure the element is present, because when I run a match_all query the first hit is the following:
{
"took": 269,
"timed_out": false,
"_shards": {
"total": 210,
"successful": 210,
"failed": 0
},
"hits": {
"total": 68292,
"max_score": 1,
"hits": [
{
"_index": "logstash-2018.02.22",
"_type": "alert",
"_id": "AWEdVphtJjppDZ0FiAz-",
"_score": 1,
"_source": {
"#version": "1",
"#timestamp": "2018-02-22T10:07:41.549Z",
"path": "/something",
"host": "host.host",
"type": "alert",
"SERVER_TYPE": "STANDALONE",
"LOG_FILE": "log.log",
"DATE": "2018-02-22 11:02:02,367",
"ALERT_TYPE": "ERROR",
"MESSAGE": "There is an error"
}
}
]
}
}
Here I can see that the field has the value I am expecting. From the mapping I know that the field is analyzed by the default analyzer and that the raw sub-field is not analyzed (thanks to the answer of Glenn Van Schil). The mapping is generated dynamically by Logstash, but it looks like this for the type I'm looking into:
"alert": {
"_all": {
"enabled": true,
"omit_norms": true
},
"dynamic_templates": [
{
"message_field": {
"mapping": {
"index": "analyzed",
"omit_norms": true,
"fielddata": { "format": "disabled" },
"type": "string"
},
"match": "message",
"match_mapping_type": "string"
}
},
{
"string_fields": {
"mapping": {
"index": "analyzed",
"omit_norms": true,
"fielddata": { "format": "disabled" },
"type": "string",
"fields": {
"raw": {
"index": "not_analyzed",
"ignore_above": 256,
"type": "string"
}
}
},
"match": "*",
"match_mapping_type": "string"
}
}
],
"properties": {
"#timestamp": { "type": "date", "format": "strict_date_optional_time||epoch_millis" },
"#version": { "type": "string", "index": "not_analyzed" },
"ALERT_TYPE": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"DATE": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"LOG_FILE": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"MESSAGE": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"SERVER_TYPE": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"geoip": {
"dynamic": "true",
"properties": {
"ip": { "type": "ip" },
"latitude": { "type": "float" },
"location": { "type": "geo_point" },
"longitude": { "type": "float" }
}
},
"host": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"path": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
},
"type": {
"type": "string",
"norms": { "enabled": false },
"fielddata": { "format": "disabled" },
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
}
Does anyone have a clue why this query keeps returning nothing? Maybe there is something in the mapping that I am missing which explains why the match or term query keeps failing? I'm running out of ideas about what is happening, and I'm quite new to Elasticsearch and Logstash.
Versions of tools and environment :
OS: RHEL Server 6.5 (Santiago)
Java: 1.7.0_91
Elasticsearch: 2.4.6
Lucene: 5.5.4
Logstash: 2.4.1
This is not really an answer, but it was too complicated to write as a comment.
from the mapping i know the field is not analysed.
You are searching for ALERT_TYPE, but this one is in fact analyzed with the default analyzer since you did not specify any analyzer directly under your ALERT_TYPE's mapping.
However, your ALERT_TYPE has an internal field named raw that is not analyzed. If you want to search documents using the raw field you'll need to change the query from
"must": [
{ "term": { "ALERT_TYPE": "ERROR" }}
]
to
"must": [
{ "term": { "ALERT_TYPE.raw": "ERROR" }}
]
I'm trying to figure out how mappings work but can't get it right. I copied the Logstash template to use it for my custom index name; however, I'm getting the following error:
MapperParsingException[failed to parse [data]]; nested: IllegalArgumentException[unknown property [customerId]];
at org.elasticsearch.index.mapper.FieldMapper.parse(FieldMapper.java:329)
at org.elasticsearch.index.mapper.DocumentParser.parseObjectOrField(DocumentParser.java:311)
at org.elasticsearch.index.mapper.DocumentParser.parseObject(DocumentParser.java:328)
at org.elasticsearch.index.mapper.DocumentParser.parseObject(DocumentParser.java:254)
at org.elasticsearch.index.mapper.DocumentParser.parseDocument(DocumentParser.java:124)
at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:309)
at org.elasticsearch.index.shard.IndexShard.prepareCreate(IndexShard.java:533)
at org.elasticsearch.index.shard.IndexShard.prepareCreateOnPrimary(IndexShard.java:510)
at org.elasticsearch.action.index.TransportIndexAction.prepareIndexOperationOnPrimary(TransportIndexAction.java:214)
at org.elasticsearch.action.index.TransportIndexAction.executeIndexRequestOnPrimary(TransportIndexAction.java:223)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardIndexOperation(TransportShardBulkAction.java:327)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:120)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:68)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryPhase.doRun(TransportReplicationAction.java:657)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:287)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:279)
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:77)
at org.elasticsearch.transport.TransportService$4.doRun(TransportService.java:378)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: unknown property [customerId]
at org.elasticsearch.index.mapper.core.StringFieldMapper.parseCreateFieldForString(StringFieldMapper.java:371)
at org.elasticsearch.index.mapper.core.StringFieldMapper.parseCreateField(StringFieldMapper.java:320)
at org.elasticsearch.index.mapper.FieldMapper.parse(FieldMapper.java:321)
... 22 more
I tried to ignore the data field, which is actually an object, so that it would be processed and saved as a raw string. Below is the mapping template that I'm attempting to use:
{
"order": 0,
"template": "sl-prod-*",
"settings": {
"index": {
"refresh_interval": "5s"
}
},
"mappings": {
"_default_": {
"dynamic_templates": [
{
"message_field": {
"mapping": {
"fielddata": {
"format": "disabled"
},
"index": "analyzed",
"omit_norms": true,
"type": "string"
},
"match_mapping_type": "string",
"match": "message"
}
},
{
"string_fields": {
"mapping": {
"fielddata": {
"format": "disabled"
},
"index": "analyzed",
"omit_norms": true,
"type": "string",
"fields": {
"raw": {
"ignore_above": 256,
"index": "not_analyzed",
"type": "string"
},
"data": {
"ignore_above": 256,
"index": "not_analyzed",
"type": "string"
}
}
},
"match_mapping_type": "string",
"match": "*"
}
}
],
"_all": {
"omit_norms": true,
"enabled": true
},
"properties": {
"msg": {
"index": "not_analyzed",
"type": "string"
},
"#timestamp": {
"type": "date"
},
"geoip": {
"dynamic": true,
"properties": {
"ip": {
"type": "ip"
},
"latitude": {
"type": "float"
},
"location": {
"type": "geo_point"
},
"longitude": {
"type": "float"
}
}
},
"data": {
"index": "not_analyzed",
"type": "string"
},
"#version": {
"index": "not_analyzed",
"type": "string"
}
}
}
},
"aliases": {}
}
Any help will be appreciated ...
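In your mapping, 'data' is declared as a string, but the document you are indexing sends 'data' as an object containing a customerId property. A string field cannot hold sub-properties, hence the "unknown property [customerId]" error.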
See here for a similar issue: https://github.com/elastic/elasticsearch/issues/5084
I would like to index the SMTP receive log of my Exchange Server with Elasticsearch. I created a Logstash config file and it works very well, but all of my fields are strings instead of, for example, type ip for the source and target server. So I tried to change the default mapping in the Logstash template:
I run the command curl -XGET http://localhost:9200/_template/logstash?pretty > C:\temp\logstashTemplate.txt
I edited the text file and added my 'SourceIP' field:
{
"template": "logstash-*",
"settings": {
"index": {
"refresh_interval": "5s"
}
},
"mappings": {
"_default_": {
"dynamic_templates": [{
"message_field": {
"mapping": {
"fielddata": {
"format": "disabled"
},
"index": "analyzed",
"omit_norms": true,
"type": "string"
},
"match_mapping_type": "string",
"match": "message"
}
}, {
"string_fields": {
"mapping": {
"fielddata": {
"format": "disabled"
},
"index": "analyzed",
"omit_norms": true,
"type": "string",
"fields": {
"raw": {
"ignore_above": 256,
"index": "not_analyzed",
"type": "string"
}
}
},
"match_mapping_type": "string",
"match": "*"
}
}],
"_all": {
"omit_norms": true,
"enabled": true
},
"properties": {
"#timestamp": {
"type": "date"
},
"geoip": {
"dynamic": true,
"properties": {
"ip": {
"type": "ip"
},
"latitude": {
"type": "float"
},
"location": {
"type": "geo_point"
},
"longitude": {
"type": "float"
}
}
},
"#version": {
"index": "not_analyzed",
"type": "string"
},
"SourceIP": {
"type": "ip"
}
}
}
},
"aliases": {}
}
I uploaded the edited template with the command curl -XPUT http://localhost:9200/_template/logstash -d@C:\temp\logstash.template
I restarted the Elasticsearch server, then deleted and re-created the index.
The 'SourceIP' field did not change to type ip. What am I doing wrong? Can you please give me a hint? Thanks!