Elasticsearch Aggregation (scripted metric aggregation) - elasticsearch

I have a product collection in which each product contains an array of categories.
What I need is an aggregation(?) that returns an array of distinct category_id values.
It would also be nice to have the number of matching products per category_id.
Sample index data
{
"data": [
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4257,
"max_score": 1.0,
"hits": [
{
"_index": "index_name",
"_type": "product",
"_id": "4",
"_score": 1.0,
"_source": {
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4"
},
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
},
{
"category_id": 4,
"is_parent": true,
"is_virtual": "false",
"name": "Category name2"
},
{
"category_id": 7,
"is_parent": true,
"is_virtual": "false",
"name": "Category name3"
}
]
}
]
}
}
]
}
What I need is an array of category ids, sorted by product count, with the possibility to filter categories (by "is_virtual" or "is_parent"):
Expected result
{
"data": [
{
"categories": [
{
"category_id": 2,
"doc_count": 555
},
{
"category_id": 3,
"doc_count": 150
},
{
"category_id": 56,
"doc_count": 12
}
]
}
]
}
Index mapping
{
"index": {
"mappings": {
"product": {
"_all": {
"enabled": false
},
"properties": {
"EAN01": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN02": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN03": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN04": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN05": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN06": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN07": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN08": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN09": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN10": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN11": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN12": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN13": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN14": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN15": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN16": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN17": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN18": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN19": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN20": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"activity": {
"type": "integer"
},
"attribute_set1_1": {
"type": "integer"
},
"attribute_set_id": {
"type": "integer"
},
"attributeset2attribute1": {
"type": "integer"
},
"attributeset3attribute1": {
"type": "integer"
},
"autocomplete": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"belongs_to_catalog": {
"type": "boolean"
},
"ca_1_1243545189": {
"type": "integer"
},
"category": {
"type": "nested",
"properties": {
"category_id": {
"type": "integer"
},
"is_blacklisted": {
"type": "boolean"
},
"is_parent": {
"type": "boolean"
},
"is_virtual": {
"type": "boolean"
},
"name": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"position": {
"type": "integer"
}
}
},
"category_gear": {
"type": "integer"
},
"children_attributes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"children_ids": {
"type": "integer"
},
"color": {
"type": "integer"
},
"configurable_attributes": {
"type": "keyword"
},
"configurable_variation": {
"type": "integer"
},
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
},
"custom_price": {
"type": "keyword",
"ignore_above": 256
},
"description": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"eco_collection": {
"type": "boolean"
},
"entity_id": {
"type": "integer"
},
"erin_recommends": {
"type": "boolean"
},
"features_bags": {
"type": "integer"
},
"gender": {
"type": "integer"
},
"has_options": {
"type": "boolean"
},
"image": {
"type": "keyword",
"ignore_above": 256
},
"indexed_attributes": {
"type": "keyword"
},
"manufacturer": {
"type": "integer"
},
"material": {
"type": "integer"
},
"mycolor": {
"type": "integer"
},
"mysize": {
"type": "integer"
},
"name": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"sortable": {
"type": "text",
"analyzer": "sortable",
"fielddata": true
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"new": {
"type": "boolean"
},
"option_text_activity": {
"type": "keyword",
"ignore_above": 256
},
"option_text_attribute_set1_1": {
"type": "keyword",
"ignore_above": 256
},
"option_text_attributeset2attribute1": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_attributeset3attribute1": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_belongs_to_catalog": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_ca_1_1243545189": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_category_gear": {
"type": "keyword",
"ignore_above": 256
},
"option_text_color": {
"type": "keyword",
"ignore_above": 256
},
"option_text_configurable_variation": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_eco_collection": {
"type": "keyword",
"ignore_above": 256
},
"option_text_erin_recommends": {
"type": "keyword",
"ignore_above": 256
},
"option_text_features_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_gender": {
"type": "keyword",
"ignore_above": 256
},
"option_text_manufacturer": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_material": {
"type": "keyword",
"ignore_above": 256
},
"option_text_mycolor": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_mysize": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_new": {
"type": "keyword",
"ignore_above": 256
},
"option_text_performance_fabric": {
"type": "keyword",
"ignore_above": 256
},
"option_text_sale": {
"type": "keyword",
"ignore_above": 256
},
"option_text_size": {
"type": "keyword",
"ignore_above": 256
},
"option_text_status": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_strap_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_style_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_tax_class_id": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"performance_fabric": {
"type": "boolean"
},
"price": {
"type": "nested",
"properties": {
"customer_group_id": {
"type": "integer"
},
"final_price": {
"type": "double"
},
"is_discount": {
"type": "boolean"
},
"max_price": {
"type": "double"
},
"min_price": {
"type": "double"
},
"original_price": {
"type": "double"
},
"price": {
"type": "double"
},
"tax_class_id": {
"type": "integer"
}
}
},
"required_options": {
"type": "boolean"
},
"sale": {
"type": "boolean"
},
"search": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"search_query": {
"type": "nested",
"properties": {
"is_blacklisted": {
"type": "boolean"
},
"position": {
"type": "integer"
},
"query_id": {
"type": "integer"
}
}
},
"short_description": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"size": {
"type": "integer"
},
"sku": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"sortable": {
"type": "text",
"analyzer": "sortable",
"fielddata": true
},
"untouched": {
"type": "keyword",
"ignore_above": 256
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "reference"
},
"spelling": {
"type": "text",
"fields": {
"phonetic": {
"type": "text",
"analyzer": "phonetic"
},
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"status": {
"type": "integer"
},
"stock": {
"properties": {
"is_in_stock": {
"type": "boolean"
},
"qty": {
"type": "integer"
}
}
},
"strap_bags": {
"type": "integer"
},
"style_bags": {
"type": "integer"
},
"tax_class_id": {
"type": "integer"
},
"type_id": {
"type": "keyword"
},
"updated_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
},
"url_key": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"visibility": {
"type": "integer"
}
}
}
}
}
}

Adding a working example with index data, mapping, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"category": {
"type": "nested"
}
}
}
}
Index Data:
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name3"
}
]
}
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
}
]
}
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
},
{
"category_id": 4,
"is_parent": true,
"is_virtual": "false",
"name": "Category name2"
}
]
}
Search Query:
{
"size": 0,
"aggs": {
"nested_path": {
"nested": {
"path": "category"
},
"aggs": {
"distinct_categories": {
"terms": {
"field": "category.category_id"
},
"aggs": {
"top_category_hits": {
"top_hits": {
"_source": {
"includes": [
"category.category_id"
]
}
}
}
}
}
}
}
}
}
Search Result:
"aggregations": {
"nested_path": {
"doc_count": 6,
"distinct_categories": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 3, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "3",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2 <-- note this
}
}
]
}
}
},
{
"key": 3,
"doc_count": 2, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 1
},
"_score": 1.0,
"_source": {
"category_id": 3
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "category",
"offset": 1
},
"_score": 1.0,
"_source": {
"category_id": 3 <-- note this
}
}
]
}
}
},
{
"key": 4,
"doc_count": 1, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 2
},
"_score": 1.0,
"_source": {
"category_id": 4 <-- note this
}
}
]
}
}
}
]
}

Related

How to remove stop words with the attachment processor in Elasticsearch?

I have the following index template:
PUT _index_template/aclimdb_1
{
"index_patterns": [
"aclimdb-*"
],
"template": {
"settings": {
"number_of_shards": 1,
"index": {
"max_result_window": "25000"
}
},
"mappings": {
"_source": {
"enabled": true
},
"properties": {
"attachment.content": {
"type": "text",
"analyzer": "english"
},
"label": {
"type": "integer"
},
"class": {
"type": "integer"
}
}
}
},
"version": 1,
"priority": 500,
"composed_of": [
],
"_meta": {
"description": "aclImdb index"
}
}
and the following mappings:
{
"mappings": {
"_doc": {
"properties": {
"attachment": {
"properties": {
"content": {
"type": "text",
"analyzer": "english"
},
"content_length": {
"type": "long"
},
"content_type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"language": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"class": {
"type": "integer"
},
"data": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"filename": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"label": {
"type": "integer"
}
}
}
}
}
and the following ingest pipeline:
[
{
"attachment": {
"field": "data"
}
},
{
"html_strip": {
"field": "attachment.content"
}
}
]
I would like to remove the stop words from the terms list when I upload the documents. How can I do that?
Do I have to do it in the ingest pipeline or in the settings of the index template?
Thank you very much for your help
Edit:
Thanks to the answer below I found a way to solve the problem. Here is my new index template.
PUT _index_template/aclimdb_1
{
"index_patterns": [
"aclimdb-*"
],
"template": {
"settings": {
"number_of_shards": 1,
"index": {
"max_result_window": "25000"
},
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"english_analyzer": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_stemmer"
]
}
}
}
},
"mappings": {
"_source": {
"enabled": true
},
"properties": {
"attachment.content": {
"type": "text",
"analyzer": "english_analyzer"
},
"label": {
"type": "integer"
},
"class": {
"type": "integer"
}
}
}
},
"version": 1,
"priority": 500,
"composed_of": [],
"_meta": {
"description": "aclImdb index"
}
}
Is there a cleaner way to achieve the same result?
Now I have the next challenge: I would like to filter out all numeric values. What would a filter that achieves this look like?
You can create a field whose analyzer uses the English stop-word filter. Then, when you index a document, the tokens generated for that field will not include the stop words.
Read more about the stop words filter.
Example:
PUT idx_stop_words
{
"settings": {
"analysis": {
"filter": {
"english_stop_filter": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"english_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"english_stop_filter"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "english_analyzer"
}
}
}
}
GET idx_stop_words/_analyze
{
"field": "content",
"text": "the woman in the window"
}
Token:
{
"tokens": [
{
"token": "woman",
"start_offset": 4,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "window",
"start_offset": 17,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 4
}
]
}

Elasticsearch query for all values of field with group by

I am having trouble forming a query that fetches all values, grouped in the manner of a SQL GROUP BY.
Below is my data structure:
product index:
{
"createdBy" : "61c1fcdd88dbad1920da8caf",
"creationTime" : "2021-12-22T11:58:53.576932Z",
"lastModifiedBy" : "61c1fcdd88dbad1920da8caf",
"lastModificationTime" : "2021-12-22T11:58:53.576932Z",
"id" : "61c312fdc6aa620a609db0b2",
"title" : "string",
"brand" : "string",
"longDesc" : "string",
"categoryId" : "string",
"imageUrls" : [
"string",
"string"
],
"keySpecs" : [
"string",
"string",
],
"facets" : [
{
"name" : "color",
"value" : "red"
},
{
"name" : "storage",
"value" : "16 GB"
},
{
"name" : "brand",
"value" : "Intex"
}
],
"categoryName" : "handsets"
}
Now, I want to fetch all the facets with their distinct values, and the count for each value as well. Let's say
productA has color blue, productB has color red
productA has brand ABC, productB has brand XYZ
So, I want data that lists all facets like this:
color: blue(200 count), red (12 count)
brand: ABC(13 count), XYZ (99 count)
Also, different products will have different types of facets: an iPhone will have color, memory, brand and size, but a pen will have color and brand only (no memory/size).
Note: I'm using the latest version of Elasticsearch.
=================
UPDATE 1:
Below are the ES mapping details:
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": [
"example"
]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"lalashree_standard_analyzer": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
},
"html_standard_analyzer": {
"char_filter": [
"html_strip"
],
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "keyword"
},
"createdBy": {
"type": "keyword"
},
"creationTime": {
"type": "date"
},
"lastModifiedBy": {
"type": "keyword"
},
"lastModificationTime": {
"type": "date"
},
"deleted": {
"type": "boolean"
},
"deletedBy": {
"type": "keyword"
},
"deletionTime": {
"type": "date"
},
"title": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"suggest": {
"type": "completion"
}
}
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"categoryId": {
"type": "keyword"
},
"searchDetails": {
"type": "object",
"properties": {
"desc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"keywords": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"imageUrls": {
"type": "keyword",
"index": false
},
"keySpecs": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"sections": {
"type": "object",
"properties": {
"name": {
"type": "text",
"index": false
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"htmlContent": {
"type": "text",
"analyzer": "html_standard_analyzer"
}
}
},
"facets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"specificationItems": {
"type": "object",
"properties": {
"key": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"values": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
}
}
},
"categoryName": {
"type": "keyword"
},
"productFamily": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"familyVariantOptions": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"values": {
"type": "keyword"
}
}
},
"productFamilyItems": {
"type": "nested",
"properties": {
"baseProductId": {
"type": "keyword"
},
"itemVariantInfoSet": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
}
}
}
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"stores": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"logo": {
"type": "keyword",
"index": false
},
"active": {
"type": "boolean"
},
"name": {
"type": "text"
},
"quantity": {
"type": "long"
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"price.mrp": {
"type": "float"
},
"price.sp": {
"type": "float"
},
"location.geoPoint": {
"type": "geo_point"
},
"oos": {
"type": "boolean"
}
}
}
}
}
}
This query first groups by facet name, then groups each name's values. By setting the "size" parameters, you can control the number of facets you want and the number of values in each facet. I think it does what you need.
Note that if you have a very large number of documents and performance matters, this query may perform badly.
{
"size": 0,
"aggs": {
"facets": {
"nested": {
"path": "facets"
},
"aggs": {
"names": {
"terms": {
"field": "facets.name",
"size": 10
},
"aggs": {
"values": {
"terms": {
"field": "facets.value",
"size": 10
}
}
}
}
}
}
}
}

ElasticSearch query relevance

I would like to find a product with the following search priority: pickRef, name, synonym (it's an array), and the other fields after. I haven't managed to get a working query... I have to boost synonym by "50" in order to get the product into the top 8 results...
The aim of my query is an autocompletion search with fuzziness (to tolerate misspellings).
I have a product with the synonym "caca". When I search for "caca", ES returns every coca product, but not the product with the synonym "caca". However, the product with the synonym "caca" should be the first result because it matches the synonym field exactly, and the coca products should come after it (they match only because of the fuzziness parameter).
Here is my index:
{
"product": {
"aliases": {},
"mappings": {
"properties": {
"brand": {
"type": "keyword",
"boost": 3
},
"catalogue": {
"type": "keyword"
},
"category": {
"type": "text",
"analyzer": "standard"
},
"description": {
"properties": {
"de": {
"type": "text",
"boost": 3,
"analyzer": "german"
},
"en": {
"type": "text",
"boost": 3,
"analyzer": "english"
},
"fr": {
"type": "text",
"boost": 3,
"analyzer": "french"
},
"lu": {
"type": "text",
"boost": 3
}
}
},
"description_ecology": {
"properties": {
"de": {
"type": "text",
"boost": 3,
"analyzer": "german"
},
"en": {
"type": "text",
"boost": 3,
"analyzer": "english"
},
"fr": {
"type": "text",
"boost": 3,
"analyzer": "french"
},
"lu": {
"type": "text",
"boost": 3
}
}
},
"enabled": {
"type": "boolean"
},
"image": {
"type": "text"
},
"name": {
"properties": {
"de": {
"type": "text",
"boost": 3,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "german"
},
"en": {
"type": "text",
"boost": 3,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "english"
},
"fr": {
"type": "text",
"boost": 3,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "french"
},
"lu": {
"type": "text",
"boost": 3,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"pickRef": {
"type": "keyword",
"boost": 5
},
"replaced": {
"type": "boolean"
},
"slug": {
"type": "text"
},
"synonym": {
"type": "keyword",
"boost": 3
}
}
},
"settings": {
"index": {
"routing": {
"allocation": {
"include": {
"_tier_preference": "data_content"
}
}
},
"number_of_shards": "1",
"provided_name": "product",
"creation_date": "1634287857507",
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
}
},
"char_filter": {
"pre_negs": {
"pattern": "a \\w",
"type": "pattern_replace",
"replacement": ""
}
}
},
"number_of_replicas": "0",
"uuid": "EGLmpv8bRlCnfLBxHZOKmA",
"version": {
"created": "7150099"
}
}
}
}
}
Here is my query:
{
"index": "product",
"size": 8,
"body": {
"query": {
"bool": {
"must": [
{
"match": {
"enabled": true
}
},
{
"match": {
"replaced": false
}
}
],
"should": [
{
"match": {
"name.fr": {
"query": "caca",
"analyzer": "standard"
}
}
},
{
"match": {
"synonym": {
"query": "caca",
"boost": 20,
"analyzer": "standard"
}
}
},
{
"multi_match": {
"query": "caca",
"fields": [
"brand^2",
"pickRef^5",
"catalogue",
"name.fr^3",
"name.en^1",
"name.de^1",
"name.lu^1",
"description.fr^1",
"description.en^1",
"description.de^1",
"description.lu^1",
"description_ecologique.fr^1",
"description_ecologique.en^1",
"description_ecologique.de^1",
"description_ecologique.lu^1"
],
"fuzziness": "AUTO"
}
},
{
"query_string": {
"query": "caca"
}
}
]
}
}
}
}
These are my products:
{
"_index": "product",
"_type": "_doc",
"_id": "1594",
"_version": 1,
"_seq_no": 1593,
"_primary_term": 1,
"found": true,
"_source": {
"name": {
"fr": "PLANTE ARTIFICIELLE BAMBOU 120cm"
},
"pickRef": "122638",
"description": {
"fr": "Agrémentez votre lieu de travail avec cette superbe plante ! Elle garantit un environnement très naturel, ne nécessite pas d'entretien et agrémente n'importe quel espace. Tronc en bois, feuillage en polyester , livrée dans un pot standard en plastique."
},
"description_ecology": {
"fr": ""
},
"catalogue": "P399",
"image": "uploads/product/122638/122638.png",
"brand": "PAPERFLOW",
"category": "Autres",
"slug": "plante-artificielle-bambou-120cm-122638-122638",
"enabled": true,
"synonym": [],
"replaced": false
}
}
{
"_index": "product",
"_type": "_doc",
"_id": "3131",
"_version": 1,
"_seq_no": 3130,
"_primary_term": 1,
"found": true,
"_source": {
"name": {
"fr": "ROYCO MINUTE SOUP \"POIS AU JAMBON\""
},
"pickRef": "141065",
"description": {
"fr": "Retrouvez le bon goût des légumes dans ces recettes de tradition alliant tout le savoir-faire de Royco Minute Soup à la saveur des meilleurs ingrédients."
},
"description_ecology": {
"fr": ""
},
"catalogue": "P038",
"image": "uploads/product/141065/141065.png",
"brand": "ROYCO",
"category": "Soupe & pâtes",
"slug": "royco-minute-soup-pois-au-jambon-5410056186552-141065",
"enabled": true,
"synonym": [],
"replaced": false
}
}
{
"_index": "product",
"_type": "_doc",
"_id": "6",
"_version": 2,
"_seq_no": 24511,
"_primary_term": 1,
"found": true,
"_source": {
"name": {
"fr": "AGRAFES 26/6 GALVANISEES"
},
"pickRef": "100110",
"description": {
"fr": "<div>Boîte de 1000 agrafes 26/6 galvanisées.</div>"
},
"description_ecology": {
"fr": null
},
"catalogue": "S",
"image": "uploads/product/233163/233163.png",
"brand": "autres",
"category": "Autres",
"slug": "agrafes-26-6-galvanisees-jambon-5010255827746-100110",
"enabled": true,
"synonym": [
"caca",
"jambon"
],
"replaced": false
}
}
PS : I know the example is not perfect but I don't have a better one...
Have you tried sorting by _score?
{
"index": "product",
"size": 8,
"body": {
"query": {
.
.
.
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}
}

ES index rate becomes slow after creating an index mapping

I write data using the ES BulkProcessor (I tried a Python script, Storm es-bolt, and Flink es-sink), but the index rate is very slow after creating an index mapping.
Situation 1: Leave all index settings at their defaults; the index rate can reach about 10000+.
Situation 2: Just create the index mapping; the index rate falls to 3000.
I use the same data, the same code, and the same machines.
result
flink es-sink write json data to es:
My data
I repeatedly write the same document shown below (the message field is the raw log, about 7 KB in size; I deleted some of its content so as not to exceed the question length limit):
{
"_index": "nyc_flink_test997",
"_type": "doc",
"_id": "k8uS92cBOH4ugSIjCzmn",
"_score": 1,
"_source": {
"exception": "false",
"log_id": "8F71AF1606EE46BFA9D57AA2282D8596",
"offset": "2368",
"message_length": "2103",
"level": "INFO",
"source": "/opt/hadoop/elastic-stack/s_login/Gusermanager.usermanager.s_login.20.log",
"sessionid": "provider-60-2883b4bd3ff2b",
"associate_id": "33d081b83a0654a2",
"message": """
[16:41:33.376][I][ec4edfe0b2584b73]log start:53F9A1A1E71044E281755E930E1B004C
[16:41:33.376][T][ec4edfe0b2584b73]入参0=__REQ__
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:4119)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2570)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2731)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2815)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:2155)
at com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:2322)
at cn.com.agree.addal.cp.ProxyPreparedStatement.executeQuery(ProxyPreparedStatement.java:46)
at tc.bank.aesb.mbs.MBS_DBIMPL.PyDBGetSel(MBS_DBIMPL.java:1624)
at tc.bank.aesb.mbs.MBS_DBIMPL.PyDBExecOneSQL(MBS_DBIMPL.java:466)
at tc.bank.aesb.mbs.MBS_DBIMPL.PyDBExecGrpSQL(MBS_DBIMPL.java:123)
at tc.bank.aesb.mbs.B_MBS_DataBase.B_DBUnityRptOpr(B_MBS_DataBase.java:121)
at CUST.CustomerInfoQry.TCustomerInfoQry$Step1$Node4.execute(TCustomerInfoQry.java:200)
at CUST.CustomerInfoQry.TCustomerInfoQry$Step1.execute(TCustomerInfoQry.java:113)
at CUST.CustomerInfoQry.TCustomerInfoQry.execute(TCustomerInfoQry.java:76)
at cn.com.agree.afa.svc.javaengine.JavaEngine.execute(JavaEngine.java:237)
at cn.com.agree.afa.svc.handler.TradeHandler.handle(TradeHandler.java:62)
[16:41:33.414][I][ec4edfe0b2584b73]log end:53F9A1A1E71044E281755E930E1B004C
""",
"exec_ip": "10.88.188.167",
"start_time": "2018-12-09 16:46:14.764",
"group_v2": "Gusermanager",
"script_exec_time": "1",
"trade_exec_time": "2"
}
}
index mapping
{
"mappings": {
"doc":{
"dynamic_templates": [
{
"string_fields": {
"match": "*",
"match_mapping_type": "string",
"mapping": {
"type": "text",
"norms": false,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
],
"properties": {
"#timestamp": {
"type": "date"
},
"#version": {
"type": "keyword"
},
"geoip": {
"dynamic": true,
"properties": {
"ip": {
"type": "ip"
},
"location": {
"type": "geo_point"
},
"latitude": {
"type": "half_float"
},
"longitude": {
"type": "half_float"
}
}
},
"exception": {
"type": "boolean"
},
"message":{
"type":"text",
"norms": false,
"analyzer": "ik_max_word"
},
"associate_id": {
"type": "text",
"analyzer": "ik_max_word"
},
"end_time": {
"type": "date",
"format": "date_time||yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd||epoch_millis||HH:mm:ss.SSS"
},
"start_time": {
"type": "date",
"format": "date_time||yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd||epoch_millis||HH:mm:ss.SSS"
},
"exec_ip": {
"type": "ip"
},
"level": {
"type": "keyword"
},
"script_exec_time": {
"type": "long"
},
"trade_exec_time": {
"type": "long"
},
"sessionid": {
"type": "text",
"analyzer": "ik_max_word"
},
"log_id": {
"type": "text",
"analyzer": "ik_max_word"
},
"discard_time": {
"type": "long"
},
"scene_code": {
"type": "text",
"analyzer": "ik_max_word"
},
"service_code": {
"type": "text",
"analyzer": "ik_max_word"
},
"group": {
"type": "text"
},
"group_v2" :{
"type": "text",
"analyzer": "ik_max_word"
},
"message_length":{
"type": "long"
},
"log_filename":{
"type": "text",
"analyzer": "ik_max_word"
},
"ingest_time":{
"type": "date"
}
}
}
}
}
I tried writing with Python scripts and Storm es-bolt; the result is the same: the index rate falls after creating the index mapping. Can anyone give me some ideas about this? Thanks in advance.

Elasticsearch Sorting fields anomaly

I am trying to sort a list on certain fields, firstName and lastName, but I have noticed some inconsistent results.
I am running a simple query
//Return all the employees from a specific company ordering by lastName asc | desc
GET employee-index-sorting
{
"query": {
"bool": {
"filter": {
"term": {
"companyId": 3179
}
}
}
},
"sort": [
{
"lastName.keyword": { <-- Should this be keyword? or not_analyzed
"order": "desc"
}
}
]
}
In the result, why would van der Mescht and van Breda come before Zwane and Zwezwe?
I suspect there is something wrong with my mappings.
{
"_index": "employee-index",
"_type": "_doc",
"_id": "637467",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name",
"lastName": "van der Mescht",
},
"sort": [
"van der Mescht"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "678335",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name3",
"lastName": "van Breda",
},
"sort": [
"van Breda"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "113896",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name2",
"lastName": "Zwezwe",
},
"sort": [
"Zwezwe"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "639639",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name1",
"lastName": "Zwane",
},
"sort": [
"Zwane"
]
}
Mappings
I am posting the entire mapping because I am not sure whether there might be something else wrong with it.
How should I change the lastName and firstName properties to allow for sorting on them?
PUT employee-index-sorting
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
"keyword_analyzer": {
"filter": [
"lowercase",
"asciifolding",
"trim"
],
"char_filter": [],
"type": "custom",
"tokenizer": "keyword"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
},
"edge_ngram_search_analyzer": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 5,
"token_chars": [
"letter"
]
}
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"employeeId": {
"type": "keyword"
},
"companyGroupId": {
"type": "keyword"
},
"companyId": {
"type": "keyword"
},
"number": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"preferredName": {
"type": "text",
"index": false
},
"firstName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"middleName": {
"type": "text",
"index": false
},
"lastName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"fullName": {
"type": "text",
"fields": {
"keywordstring": {
"type": "text",
"analyzer": "keyword_analyzer"
},
"edgengram": {
"type": "text",
"analyzer": "edge_ngram_analyzer",
"search_analyzer": "edge_ngram_search_analyzer"
}
},
"analyzer": "standard"
},
"terminationDate": {
"type": "date"
},
"companyName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"idNumber": {
"type": "text"
},
"description": {
"type": "text",
"index": false
},
"jobNumber": {
"type": "keyword"
},
"frequencyId": {
"type": "long"
},
"frequencyCode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"frequencyAccess": {
"type": "boolean"
}
}
}
}
}
For sorting you need to use lastName.keyword; that's correct, no need to change anything there.
The reason why van der Mescht and van Breda come before Zwane and Zwezwe is that keyword fields are sorted lexicographically by character code (basically the ASCII table), in which all uppercase letters come before all lowercase ones. In ascending order, every name starting with an uppercase letter therefore sorts before every name starting with a lowercase letter. But since you're sorting in desc mode, the order is exactly the opposite:
z...
...
van der Mescht
...
van Breda
...
a...
...
Zwezwe
...
Zwane
...
Z...
...
A...
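To fix this, simply add a lowercase normalizer to your lastName.keyword field so that values are compared case-insensitively, i.e. change your settings and mapping to this and it will work (a normalizer cannot be added to an already existing field, so you need to re-create the index with the new mapping and reindex your data):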
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
...
},
"tokenizer": {
...
},
"normalizer": { <-- add this
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
}
},
"mappings": {
"_doc": {
"properties": {
...
"lastName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "lowersort", <-- add this
"ignore_above": 256
}
}
},
...
}
}
}
}

Resources