Update Mapping of existing Index in Elasticsearch - elasticsearch

I am totally new to Elasticsearch, so please forgive me if this is a stupid question. It may already have been answered somewhere else, but I couldn't find it.
I want to use Elasticsearch as a search engine for the PDFs and DOCX files on my network. I used FSCrawler to ingest the PDFs into Elasticsearch. Since the documents I want to ingest are in several languages, I wanted to use n-grams for stemming. To do so, I wanted to update my mapping like this:
PUT test/_mappings/_all
{
"mappings": {
"title": {
"properties": {
"title": {
"type": "text",
"fields": {
"de": {
"type": "string",
"analyzer": "german"
},
"en": {
"type": "string",
"analyzer": "english"
},
"general": {
"type": "string",
"analyzer": "trigrams"
}
}
}
}
}
}
}
And now I get this error message:
{ "error": {
"root_cause": [
{
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text,
fields={de={type=string, analyzer=german}, en={type=string,
analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]"
}
],
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text,
fields={de={type=string, analyzer=german}, en={type=string,
analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]"
}, "status": 400 }
Do you have any idea how I can fix this? Or do you have an idea how I can ingest the files with the right mapping without using FSCrawler?

These are my settings:
{
"test": {
"settings": {
"index": {
"mapping": {
"total_fields": {
"limit": "2000"
}
},
"number_of_shards": "5",
"provided_name": "test",
"creation_date": "1542031632596",
"analysis": {
"filter": {
"trigrams_filter": {
"type": "ngram",
"min_gram": "3",
"max_gram": "3"
}
},
"analyzer": {
"fscrawler_path": {
"tokenizer": "fscrawler_path"
},
"trigrams": {
"filter": [
"lowercase",
"trigrams_filter"
],
"type": "custom",
"tokenizer": "standard"
}
},
"tokenizer": {
"fscrawler_path": {
"type": "path_hierarchy"
}
}
},
"number_of_replicas": "1",
"uuid": "7L3QE5_xRACECVbTFlFY-Q",
"version": {
"created": "6040399"
}
}
}
}
}

My mapping
{
"test": {
"mappings": {
"_doc": {
"dynamic_templates": [
{
"raw_as_text": {
"path_match": "meta.raw.*",
"mapping": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
"attachment": {
"type": "binary"
},
"attributes": {
"properties": {
"group": {
"type": "keyword"
},
"owner": {
"type": "keyword"
}
}
},
"content": {
"type": "text"
},
"file": {
"properties": {
"checksum": {
"type": "keyword"
},
"content_type": {
"type": "keyword"
},
"created": {
"type": "date",
"format": "dateOptionalTime"
},
"extension": {
"type": "keyword"
},
"filename": {
"type": "keyword",
"store": true
},
"filesize": {
"type": "long"
},
"indexed_chars": {
"type": "long"
},
"indexing_date": {
"type": "date",
"format": "dateOptionalTime"
},
"last_accessed": {
"type": "date",
"format": "dateOptionalTime"
},
"last_modified": {
"type": "date",
"format": "dateOptionalTime"
},
"url": {
"type": "keyword",
"index": false
}
}
},
"meta": {
"properties": {
"altitude": {
"type": "text"
},
"author": {
"type": "text"
},
"comments": {
"type": "text"
},
"contributor": {
"type": "text"
},
"coverage": {
"type": "text"
},
"created": {
"type": "date",
"format": "dateOptionalTime"
},
"creator_tool": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "dateOptionalTime"
},
"description": {
"type": "text"
},
"format": {
"type": "text"
},
"identifier": {
"type": "text"
},
"keywords": {
"type": "text"
},
"language": {
"type": "keyword"
},
"latitude": {
"type": "text"
},
"longitude": {
"type": "text"
},
"metadata_date": {
"type": "date",
"format": "dateOptionalTime"
},
"modifier": {
"type": "text"
},
"print_date": {
"type": "date",
"format": "dateOptionalTime"
},
"publisher": {
"type": "text"
},
"rating": {
"type": "byte"
},
"relation": {
"type": "text"
},
"rights": {
"type": "text"
},
"source": {
"type": "text"
},
"title": {
"type": "text"
},
"type": {
"type": "text"
}
}
},
"path": {
"properties": {
"real": {
"type": "keyword",
"fields": {
"fulltext": {
"type": "text"
},
"tree": {
"type": "text",
"analyzer": "fscrawler_path",
"fielddata": true
}
}
},
"root": {
"type": "keyword"
},
"virtual": {
"type": "keyword",
"fields": {
"fulltext": {
"type": "text"
},
"tree": {
"type": "text",
"analyzer": "fscrawler_path",
"fielddata": true
}
}
}
}
}
}
}
}
}
}

Related

Problem with rotating (ILM) Cloudflare indices on ELK cluster

The problem I have is that my Cloudflare indices report the following ILM errors:
on index with alias: illegal_argument_exception: rollover target [cloudflare] does not point to a write index
on index without alias: illegal_argument_exception: index.lifecycle.rollover_alias [cloudflare] does not point to index [cloudflare-2022.08.13-000001]
Basically what I was able to find out is that when a new index is created, it doesn't receive the alias from rollover_alias:
{
"settings": {
"index": {
"lifecycle": {
"name": "cloudflare",
"rollover_alias": "cloudflare"
},
option, which makes the rollover fail. When I manually assign the alias to all affected indices, rollover and ILM start to work again, but I want to understand why this happens and find a permanent solution to the problem. Otherwise I will have to check this cluster manually and force the data to move from HOT to WARM nodes whenever the HOT storage fills up.
The Cloudflare setup is based on this guide; in other words, Cloudflare pushes the logs to an S3 bucket, and then AWS Lambda pushes them to ELK (elastic.co).
Cloudflare index template in question:
"cloudflare": {
"index_patterns": [
"cloudflare-*"
],
"mappings": {
"properties": {
"observer.ip": {
"type": "ip"
},
"cloudflare.parent.ray_id": {
"type": "keyword"
},
"cloudflare.worker.subrequest_count": {
"type": "long"
},
"cloudflare.origin.ip": {
"type": "ip"
},
"cloudflare.edge.rate.limit.id": {
"type": "long"
},
"user_agent.version": {
"type": "keyword"
},
"cloudflare.device.type": {
"type": "keyword"
},
"cloudflare.edge.pathing.op": {
"type": "keyword"
},
"user_agent.os.version": {
"type": "keyword"
},
"source.port": {
"type": "long"
},
"cloudflare.edge.server.ip": {
"type": "ip"
},
"cloudflare.security_level": {
"type": "keyword"
},
"observer.vendor": {
"type": "keyword"
},
"event.dataset": {
"type": "keyword"
},
"cloudflare.worker.cpu_time": {
"type": "long"
},
"http.response.status_code": {
"type": "long"
},
"user_agent.minor": {
"type": "keyword"
},
"cloudflare.cache.response.status": {
"type": "long"
},
"user_agent.patch": {
"type": "keyword"
},
"#timestamp": {
"type": "date"
},
"cloudflare.edge.colo.id": {
"type": "integer"
},
"user_agent.os.full": {
"type": "keyword"
},
"source.address": {
"type": "keyword"
},
"user_agent.build": {
"type": "keyword"
},
"source.as.number": {
"type": "long"
},
"cloudflare.edge.start.timestamp": {
"type": "date"
},
"cloudflare.waf.rule.id": {
"type": "keyword"
},
"cloudflare.origin.ssl.protocol": {
"type": "keyword"
},
"http.request.bytes": {
"type": "long"
},
"source.geo.country_iso_code": {
"type": "keyword"
},
"cloudflare.edge.pathing.src": {
"type": "keyword"
},
"cloudflare.edge.response.bytes": {
"type": "long"
},
"cloudflare.edge.response.status": {
"type": "long"
},
"cloudflare.waf.rule.message": {
"type": "keyword"
},
"cloudflare.origin.response.time": {
"type": "long"
},
"url.path": {
"fields": {
"path": {
"index": true,
"eager_global_ordinals": false,
"fielddata": false,
"index_options": "positions",
"index_phrases": false,
"norms": true,
"type": "text",
"store": false
}
},
"type": "keyword"
},
"cloudflare.edge.response.compression_ratio": {
"type": "float"
},
"cloudflare.worker.subrequest": {
"type": "boolean"
},
"cloudflare.cache.response.bytes": {
"type": "long"
},
"cloudflare.waf.profile": {
"type": "keyword"
},
"cloudflare.waf.flags": {
"type": "keyword"
},
"cloudflare.firewall.matches.actions": {
"type": "keyword"
},
"cloudflare.http.response.status_code": {
"type": "long"
},
"user_agent.os.platform": {
"type": "keyword"
},
"cloudflare.waf.matched_var": {
"type": "keyword"
},
"user_agent.os_minor": {
"type": "keyword"
},
"cloudflare.worker.status": {
"type": "keyword"
},
"#version": {
"type": "keyword"
},
"cloudflare.firewall.matches.rule_ids": {
"type": "keyword"
},
"user_agent.os_major": {
"type": "keyword"
},
"cloudflare.origin.response.bytes": {
"type": "long"
},
"source.ip": {
"type": "ip"
},
"http.response.bytes": {
"type": "long"
},
"cloudflare.client.ssl.protocol": {
"type": "keyword"
},
"url.full": {
"type": "keyword"
},
"client.address": {
"type": "keyword"
},
"user_agent.os_name": {
"type": "keyword"
},
"cloudflare.edge.end.timestamp": {
"type": "date"
},
"cloudflare.origin.response.http.last_modified": {
"ignore_malformed": true,
"type": "date"
},
"user_agent.original": {
"type": "keyword"
},
"cloudflare.cache.tiered.fill": {
"type": "boolean"
},
"cloudflare.origin.response.http.expires": {
"type": "date",
"format": "E, d MMM uuuu HH:mm:ss 'UTC'"
},
"user_agent.name": {
"type": "keyword"
},
"cloudflare.waf.action": {
"type": "keyword"
},
"cloudflare.cache.status": {
"type": "keyword"
},
"cloudflare.edge.request.host": {
"type": "keyword"
},
"source.geo": {
"type": "object",
"properties": {
"region_code": {
"type": "keyword"
},
"longitude": {
"type": "float"
},
"region_iso_code": {
"type": "keyword"
},
"region_name": {
"type": "keyword"
},
"country_code2": {
"type": "keyword"
},
"ip": {
"type": "ip"
},
"continent_code": {
"type": "keyword"
},
"postal_code": {
"type": "keyword"
},
"country_code3": {
"type": "keyword"
},
"latitude": {
"type": "float"
},
"city_name": {
"type": "keyword"
},
"dma_code": {
"type": "long"
},
"country_name": {
"type": "keyword"
},
"continent_name": {
"type": "keyword"
},
"timezone": {
"type": "keyword"
},
"location": {
"type": "geo_point"
}
}
},
"cloudflare.edge.rate.limit.action": {
"type": "keyword"
},
"cloudflare.client.ssl.cipher": {
"type": "keyword"
},
"user_agent.os.name": {
"type": "keyword"
},
"cloudflare.edge.pathing.status": {
"type": "keyword"
},
"cloudflare.zone_id": {
"type": "integer"
},
"client.port": {
"type": "long"
},
"observer.type": {
"type": "keyword"
},
"http.request.referrer": {
"type": "keyword"
},
"user_agent.major": {
"type": "keyword"
},
"event.end": {
"type": "date"
},
"cloudflare.client.request.protocol": {
"type": "keyword"
},
"user_agent.device.name": {
"type": "keyword"
},
"destination.ip": {
"type": "ip"
},
"url.domain": {
"type": "keyword"
},
"http.request.method": {
"type": "keyword"
},
"cloudflare.firewall.matches.sources": {
"type": "keyword"
},
"cloudflare.edge.response.content_type": {
"type": "keyword"
},
"cloudflare.ray_id": {
"type": "keyword"
},
"event.start": {
"type": "date"
},
"ecs.version": {
"type": "keyword"
},
"client.ip": {
"type": "ip"
},
"cloudflare.edge.colo.code": {
"type": "keyword"
},
"http.version": {
"type": "keyword"
},
"cloudflare.client.ip.class": {
"type": "keyword"
},
"server.ip": {
"type": "ip"
},
"user_agent.os.kernel": {
"type": "keyword"
}
}
},
"aliases": {},
"order": 0,
"settings": {
"index": {
"number_of_replicas": "1",
"mapping": {
"ignore_malformed": "true"
},
"number_of_shards": "1",
"lifecycle": {
"rollover_alias": "cloudflare",
"name": "cloudflare"
},
"routing": {
"allocation": {
"include": {
"_tier_preference": null
}
}
}
}
}
}
}
ILM policy in question:
{
"cloudflare": {
"policy": {
"phases": {
"cold": {
"actions": {
"set_priority": {
"priority": 0
}
},
"min_age": "30d"
},
"warm": {
"actions": {
"set_priority": {
"priority": 50
}
},
"min_age": "0ms"
},
"hot": {
"actions": {
"rollover": {
"max_age": "1d"
},
"set_priority": {
"priority": 100
}
},
"min_age": "0ms"
},
"delete": {
"actions": {
"delete": {
"delete_searchable_snapshot": true
}
},
"min_age": "60d"
}
}
},
"modified_date": "2021-11-02T17:18:34.417Z",
"in_use_by": {
"indices": [
"cloudflare-2022.07.09-000001",
"cloudflare-2022.07.08-000001",
"cloudflare-2022.07.04-000001",
"cloudflare-2022.07.06-000001",
"cloudflare-2022.07.07-000001",
"cloudflare-2022.07.05-000001",
"cloudflare-2022.06.10-000001",
"cloudflare-2022.06.12-000001",
"cloudflare-2022.06.11-000001",
"cloudflare-2022.06.13-000001",
"cloudflare-2022.08.02-000001",
"cloudflare-2022.08.03-000001",
"cloudflare-2022.08.01-000001",
"cloudflare-2022.08.04-000001",
"cloudflare-2022.08.08-000001",
"cloudflare-2022.06.18-000001",
"cloudflare-2022.08.06-000001",
"cloudflare-2022.06.07-000001",
"cloudflare-2022.06.16-000001",
"cloudflare-2022.06.14-000001",
"cloudflare-2022.06.09-000001",
"cloudflare-2022.06.05-000001",
"cloudflare-2022.06.03-000001",
"cloudflare-2022.05.23-000001",
"cloudflare-2022.05.21-000001",
"cloudflare-2022.07.02-000001",
"cloudflare-2022.07.11-000001",
"cloudflare-2022.07.13-000001",
"cloudflare-2022.08.01-000017",
"cloudflare-2022.07.17-000001",
"cloudflare-2022.07.18-000001",
"cloudflare-2022.05.28-000001",
"cloudflare-2022.05.27-000001",
"cloudflare-2022.05.24-000001",
"cloudflare-2022.06.01-000001",
"cloudflare-2022.06.22-000001",
"cloudflare-2022.08.02-000023",
"cloudflare-2022.08.03-000024",
"cloudflare-2022.08.02-000021",
"cloudflare-2022.06.23-000001",
"cloudflare-2022.08.02-000022",
"cloudflare-2022.08.12-000001",
"cloudflare-2022.08.06-000027",
"cloudflare-2022.08.13-000001",
"cloudflare-2022.08.07-000028",
"cloudflare-2022.06.19-000001",
"cloudflare-2022.08.16-000001",
"cloudflare-2022.06.26-000001",
"cloudflare-2022.08.09-000001",
"cloudflare-2022.08.05-000001",
"cloudflare-2022.08.02-000020",
"cloudflare-2022.06.15-000001",
"cloudflare-2022.05.20-000001",
"cloudflare-2022.06.08-000001",
"cloudflare-2022.07.10-000001",
"cloudflare-2022.06.04-000001",
"cloudflare-2022.07.03-000001",
"cloudflare-2022.05.31-000001",
"cloudflare-2022.07.14-000001",
"cloudflare-2022.07.25-000004",
"cloudflare-2022.07.21-000001",
"cloudflare-2022.07.25-000001",
"cloudflare-2022.08.02-000018",
"cloudflare-2022.08.02-000019",
"cloudflare-2022.07.29-000001",
"cloudflare-2022.07.26-000001",
"cloudflare-2022.07.27-000009",
"cloudflare-2022.07.30-000015",
"cloudflare-2022.07.30-000014",
"cloudflare-2022.07.31-000016",
"cloudflare-2022.07.30-000013",
"cloudflare-2022.07.27-000010",
"cloudflare-2022.06.30-000001",
"cloudflare-2022.07.28-000011",
"cloudflare-2022.08.17-000001",
"cloudflare-2022.07.29-000012",
"cloudflare-2022.06.27-000001",
"cloudflare-2022.06.29-000001",
"cloudflare-2022.06.25-000001",
"cloudflare-2022.05.30-000001",
"cloudflare-2022.07.26-000008",
"cloudflare-2022.07.22-000001",
"cloudflare-2022.07.26-000007",
"cloudflare-2022.07.31-000001",
"cloudflare-2022.07.26-000006",
"cloudflare-2022.07.24-000001",
"cloudflare-2022.07.26-000005",
"cloudflare-2022.07.20-000001",
"cloudflare-2022.07.24-000003",
"cloudflare-2022.07.28-000001",
"cloudflare-2022.05.29-000001",
"cloudflare-2022.07.16-000001",
"cloudflare-2022.07.19-000001",
"cloudflare-2022.07.15-000001",
"cloudflare-2022.08.09-000030",
"cloudflare-2022.05.25-000001",
"cloudflare-2022.05.26-000001",
"cloudflare-2022.06.02-000001",
"cloudflare-2022.06.21-000001",
"cloudflare-2022.06.20-000001",
"cloudflare-2022.06.24-000001",
"cloudflare-2022.08.05-000026",
"cloudflare-2022.08.04-000025",
"cloudflare-2022.08.14-000001",
"cloudflare-2022.08.10-000001",
"cloudflare-2022.08.15-000001",
"cloudflare-2022.08.11-000001",
"cloudflare-2022.08.08-000029",
"cloudflare-2022.08.07-000001",
"cloudflare-2022.06.28-000001",
"cloudflare-2022.06.17-000001",
"cloudflare-2022.06.06-000001",
"cloudflare-2022.05.22-000001",
"cloudflare-2022.07.01-000001",
"cloudflare-2022.07.12-000001",
"cloudflare-2022.07.30-000001",
"cloudflare-2022.07.27-000001",
"cloudflare-2022.07.23-000001",
"cloudflare-2022.07.23-000002"
],
"data_streams": [],
"composable_templates": []
},
"version": 12
}
}
Elastic version: v7.16.2 provided by elastic.co on AWS

Elasticsearch query for all values of field with group by

I am having trouble forming a query that fetches all values with something like an SQL GROUP BY.
Below is my data structure:
product index:
{
"createdBy" : "61c1fcdd88dbad1920da8caf",
"creationTime" : "2021-12-22T11:58:53.576932Z",
"lastModifiedBy" : "61c1fcdd88dbad1920da8caf",
"lastModificationTime" : "2021-12-22T11:58:53.576932Z",
"id" : "61c312fdc6aa620a609db0b2",
"title" : "string",
"brand" : "string",
"longDesc" : "string",
"categoryId" : "string",
"imageUrls" : [
"string",
"string"
],
"keySpecs" : [
"string",
"string",
],
"facets" : [
{
"name" : "color",
"value" : "red"
},
{
"name" : "storage",
"value" : "16 GB"
},
{
"name" : "brand",
"value" : "Intex"
}
],
"categoryName" : "handsets"
}
Now, I want to fetch all the facets together with their distinct values and the count of each value. Let's say:
productA has color blue, productB has color red
productA has brand ABC, productB has brand XYZ
So I want data that lists all facets, like:
color: blue(200 count), red (12 count)
brand: ABC(13 count), XYZ (99 count)
Also, different products will have different types of facets: for example, an iPhone will have color, memory, brand and size, but a pen will have only color and brand (not memory/size).
Note: I'm using the latest version of Elasticsearch.
=================
UPDATE 1:
Below is the es mapping details
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": [
"example"
]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"lalashree_standard_analyzer": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
},
"html_standard_analyzer": {
"char_filter": [
"html_strip"
],
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "keyword"
},
"createdBy": {
"type": "keyword"
},
"creationTime": {
"type": "date"
},
"lastModifiedBy": {
"type": "keyword"
},
"lastModificationTime": {
"type": "date"
},
"deleted": {
"type": "boolean"
},
"deletedBy": {
"type": "keyword"
},
"deletionTime": {
"type": "date"
},
"title": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"suggest": {
"type": "completion"
}
}
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"categoryId": {
"type": "keyword"
},
"searchDetails": {
"type": "object",
"properties": {
"desc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"keywords": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"imageUrls": {
"type": "keyword",
"index": false
},
"keySpecs": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"sections": {
"type": "object",
"properties": {
"name": {
"type": "text",
"index": false
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"htmlContent": {
"type": "text",
"analyzer": "html_standard_analyzer"
}
}
},
"facets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"specificationItems": {
"type": "object",
"properties": {
"key": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"values": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
}
}
},
"categoryName": {
"type": "keyword"
},
"productFamily": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"familyVariantOptions": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"values": {
"type": "keyword"
}
}
},
"productFamilyItems": {
"type": "nested",
"properties": {
"baseProductId": {
"type": "keyword"
},
"itemVariantInfoSet": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
}
}
}
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"stores": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"logo": {
"type": "keyword",
"index": false
},
"active": {
"type": "boolean"
},
"name": {
"type": "text"
},
"quantity": {
"type": "long"
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"price.mrp": {
"type": "float"
},
"price.sp": {
"type": "float"
},
"location.geoPoint": {
"type": "geo_point"
},
"oos": {
"type": "boolean"
}
}
}
}
}
}
This query first groups by facet name and then groups each name's values. By setting the sizes, you can control the number of facets you want and the number of items in each facet. I think it does what you need.
Note that if you have too many documents and performance matters, this query may perform badly.
{
"size": 0,
"aggs": {
"facets": {
"nested": {
"path": "facets"
},
"aggs": {
"names": {
"terms": {
"field": "facets.name",
"size": 10
},
"aggs": {
"values": {
"terms": {
"field": "facets.value",
"size": 10
}
}
}
}
}
}
}
}

Is there a character limit on an individual word within a match phrase query in elastic search?

I'm fairly new to Elasticsearch, so you may have to bear with me. I'm running into a problem where a document appears if I search for it using a word of 20 characters or fewer, but if I add any more characters to the same word in the query, I get no results:
Using 'phenoxymethylpenicillin' brings no documents.
Using 'phenoxymethylpenicil' brings back documents.
This is the query I'm trying to use:
{
"match_phrase": {
"genericNames.name": {
"query": "phenoxymethylpenicillin",
"slop": 15,
"zero_terms_query": "NONE",
"boost": 1.0
}
}
}
Here is the full query: https://pastebin.com/DEJvP2uS
Like I said, I'm fairly new to this, so it may be a case of my not looking in the correct area.
So my question is, what possible areas would cause this and why?
Thanks!
Edit:
Below is an extract from one of the documents in the sample data. I can't show much of it because a lot of it is sensitive; luckily, I can share the names from the sample data. This is from the data I'm trying to search for:
"genericNames":[
{
"nameType":1,
"name":"Phenoxymethylpenicillin 250mg tablets",
"nameChangeCode":"0000",
"nameBasisCode":"0001",
"nameTypeDescription":"Name",
"startDate":"1948-01-01T00:00:00.000000+0000",
"endDate":"3456-02-01T00:00:00.000000+0000"
},
{
"nameType":5,
"name":"Penicillin V 250mg tablets",
"nameTypeDescription":"Alternative Name 3",
"startDate":"1948-01-01T00:00:00.000000+0000",
"endDate":"3456-02-01T00:00:00.000000+0000"
}
],
I have also provided the index mapping as it may provide extra information:
{
"amp": {
"mappings": {
"properties": {
"_class": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ampId": {
"type": "long"
},
"amppId": {
"type": "long"
},
"attributes": {
"type": "nested",
"properties": {
"attributeQualifier": {
"type": "keyword"
},
"attributeType": {
"type": "integer"
},
"attributeTypeDescription": {
"type": "keyword"
},
"attributeValue": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"countryId": {
"type": "long"
},
"decodedValue": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"dictionaries": {
"type": "nested",
"properties": {
"abbreviation": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"description": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"dictId": {
"type": "integer"
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"excipients": {
"type": "nested",
"properties": {
"basisOfStrengthCode": {
"type": "keyword"
},
"bossId": {
"type": "long"
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"id": {
"type": "long"
},
"ingredientNames": {
"properties": {
"endDate": {
"type": "date"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"startDate": {
"type": "date"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"strengthDenominatorUnitOfMeasureCode": {
"type": "keyword"
},
"strengthDenominatorValue": {
"type": "keyword"
},
"strengthNumeratorUnitOfMeasureCode": {
"type": "keyword"
},
"strengthNumeratorValue": {
"type": "keyword"
},
"strengthVal": {
"type": "keyword"
},
"unitOfMeasure": {
"type": "keyword"
}
}
},
"extractableEntry": {
"type": "boolean"
},
"genericNames": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"name": {
"type": "text",
"ignore_above": 256,
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "autocomplete_index",
"search_analyzer": "autocomplete_search"
},
"nameBasisCode": {
"type": "keyword"
},
"nameChangeCode": {
"type": "keyword"
},
"nameType": {
"type": "integer"
},
"nameTypeDescription": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"id": {
"type": "keyword"
},
"ingredients": {
"type": "nested",
"properties": {
"basisOfStrengthCode": {
"type": "keyword"
},
"bossId": {
"type": "long"
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"id": {
"type": "long"
},
"ingredientNames": {
"properties": {
"endDate": {
"type": "date"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"startDate": {
"type": "date"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"strengthDenominatorUnitOfMeasureCode": {
"type": "keyword"
},
"strengthDenominatorValue": {
"type": "keyword"
},
"strengthNumeratorUnitOfMeasureCode": {
"type": "keyword"
},
"strengthNumeratorValue": {
"type": "keyword"
},
"strengthVal": {
"type": "keyword"
},
"unitOfMeasure": {
"type": "keyword"
}
}
},
"invalidEntry": {
"type": "boolean"
},
"pitId": {
"type": "integer"
},
"ppaCodes": {
"type": "nested",
"properties": {
"code": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"proprietaryNames": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"name": {
"type": "text",
"ignore_above": 256,
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "autocomplete_index",
"search_analyzer": "autocomplete_search"
},
"nameBasisCode": {
"type": "keyword"
},
"nameChangeCode": {
"type": "keyword"
},
"nameType": {
"type": "integer"
},
"nameTypeDescription": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"qpuUomCde": {
"type": "keyword"
},
"qpuVal": {
"type": "keyword"
},
"qtyUomCde": {
"type": "keyword"
},
"qtyVal": {
"type": "keyword"
},
"snomedCodes": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"ppaNextNo": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"snomed": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"snomedDescriptions": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"ppaNextNo": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"snomed": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"suppliers": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"id": {
"type": "long"
},
"names": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"name": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "autocomplete_index",
"search_analyzer": "autocomplete_search"
},
"nameBasisCode": {
"type": "keyword"
},
"nameChangeCode": {
"type": "keyword"
},
"nameType": {
"type": "integer"
},
"nameTypeDescription": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
},
"udfs": {
"type": "nested",
"properties": {
"ddIndicator": {
"type": "integer"
},
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"udfsUomCode": {
"type": "keyword"
},
"udfsValue": {
"type": "keyword"
},
"vmpUomCode": {
"type": "keyword"
}
}
},
"vmpId": {
"type": "long"
},
"vmppId": {
"type": "long"
},
"vtms": {
"type": "nested",
"properties": {
"endDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
},
"id": {
"type": "long"
},
"startDate": {
"type": "date",
"format": "uuuu-MM-dd'T'HH:mm:ss.SSSSSSZ"
}
}
}
}
}
}
}
Edit: Added link to full query - https://pastebin.com/DEJvP2uS
Edit: Settings for index:
{
"index": {
"max_ngram_diff": "20",
"analysis": {
"filter": {
"autocomplete_suffix_filter": {
"type": "ngram",
"min_gram": "1",
"max_gram": "20"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete_index": {
"filter": [
"lowercase",
"autocomplete_filter",
"autocomplete_suffix_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_search": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1"
}
}
This must be happening because of the custom analyzers on your genericNames.name field: at index time you are using the autocomplete_index analyzer, and at search time the autocomplete_search analyzer. However, the definitions of these analyzers are not provided in the question; only the mapping is provided.
Please provide the output of the _settings API for your index; refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-get-settings.html for more info.
You need to check the tokens generated for phenoxymethylpenicillin using the Analyze API for both the autocomplete_index and autocomplete_search analyzers, and you will notice the difference.
In the index mapping provided above, genericNames is of the nested type, so you need to use a nested query.
Below is a working example using the same index data as provided above, along with the search query and search result.
Search Query:
{
"query": {
"nested": {
"path": "genericNames",
"query": {
"bool": {
"must": [
{
"match": {
"genericNames.name": "phenoxymethylpenicillin"
}
}
]
}
},
"inner_hits":{}
}
}
}
Search Result:
"hits": [
{
"_index": "64817981",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "genericNames",
"offset": 0
},
"_score": 0.7361701,
"_source": {
"nameType": 1,
"name": "Phenoxymethylpenicillin 250mg tablets",
"nameChangeCode": "0000",
"nameBasisCode": "0001",
"nameTypeDescription": "Name",
"startDate": "1948-01-01T00:00:00.000000+0000",
"endDate": "3456-02-01T00:00:00.000000+0000"
}
}
]

Unable to apply new index template

I am currently trying to update an index template on Elasticsearch 6.7/6.8.
Templates are stored in the code and are applied each time my API starts.
There are no errors; the request returns 200.
For example, here is a template I am currently using:
{
"index_patterns": [ "*-ec2-reports" ],
"version": 11,
"mappings": {
"ec2-report": {
"properties": {
"account": {
"type": "keyword"
},
"reportDate": {
"type": "date"
},
"reportType": {
"type": "keyword"
},
"instance": {
"properties": {
"id": {
"type": "keyword"
},
"region": {
"type": "keyword"
},
"state": {
"type": "keyword"
},
"purchasing": {
"type": "keyword"
},
"keyPair": {
"type": "keyword"
},
"type": {
"type": "keyword"
},
"platform": {
"type": "keyword"
},
"tags": {
"type": "nested",
"properties": {
"key": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"costs": {
"type": "object"
},
"stats": {
"type": "object",
"properties": {
"cpu": {
"type": "object",
"properties": {
"average": {
"type": "double"
},
"peak": {
"type": "double"
}
}
},
"network": {
"type": "object",
"properties": {
"in": {
"type": "double"
},
"out": {
"type": "double"
}
}
},
"volumes": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"read": {
"type": "double"
},
"write": {
"type": "double"
}
}
}
}
},
"recommendation": {
"type": "object",
"properties": {
"instancetype": {
"type": "keyword"
},
"reason": {
"type": "keyword"
},
"newgeneration": {
"type": "keyword"
}
}
}
}
}
},
"_all": {
"enabled": false
},
"numeric_detection": false,
"date_detection": false
}
}
}
I'd like to add a new keyword field under the properties object like this :
"exampleField": {
"type": "keyword"
}
but it seems the template is not applied to existing indexes.
When data is inserted into a specific index which uses the template, it is stored like this:
"exampleField": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
because the template has not been updated beforehand.
I would expect it to be like:
"exampleField": {
"type": "keyword"
}
in the index and in the template.
Does anyone have an idea how to achieve this result?
Thank you, Alexandre.

Why does the keyword type take up much more space than text in elasticsearch?

env: ElasticSearch 5.5.1
First, there are two indices in my Elasticsearch setup,
and the only difference between the two indices is the message field: the type of the message field in index1 is keyword, and in index2 it is text.
To ensure that the result is not affected by other fields, I removed the message field and compared the results before and after:
Before removing the message field:
After removing the message field I got:
Obviously the message field takes up a lot of space, and the keyword type takes up much more space than text, but I don't know why keyword takes up so much more space than text.
So, is there anyone who can help me?
Following is the mapping info of index1:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"#agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"#timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and settings info:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index1",
"creation_date": "1531389785215",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "zd8oVbwUQbys1UJ8hJZRmQ",
"version": {
"created": "5050099"
}
}
}
Following is the mapping info of index2:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"#agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"#timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"CommunicationReturnCode": {
"type": "keyword"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"action": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "text"
},
"msgid": {
"type": "keyword"
},
"msgname": {
"type": "keyword"
},
"nodetype": {
"type": "keyword"
},
"orgid": {
"type": "keyword"
},
"orgname": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"processingState": {
"type": "keyword"
},
"processingStatecode": {
"type": "keyword"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"thread": {
"type": "keyword"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"transDescription": {
"type": "keyword"
},
"transactionErrorCode": {
"type": "keyword"
},
"transactionTimeConsuming": {
"type": "keyword"
},
"transcode": {
"type": "keyword"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
and settings info:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "2",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index2",
"creation_date": "1531467294314",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "yROU2MrMTzip4VXH_zWEXQ",
"version": {
"created": "5050099"
}
}
}
Following is the file structure of one of the two shards of the index with the text type field:
and of the index with the keyword type field:
You can trust that there is the same number of documents in the two folders, and that the only difference between the indices is the type of the message field.
Could you explain it?
Thank you so much!
In Elasticsearch, keyword fields have doc_values enabled by default, while text fields do not. This means that for your keyword fields it will store the whole field in a column-oriented fashion, in order to be able to perform aggregations or sorting without relying on fielddata.
Also, once you tokenize a string (with stemming, lowercasing, etc.), you can achieve much better compression.
You can try to disable doc_values on that field if you don't perform aggregations or sorting on it.

Resources