I have a product collection in which each product contains an array of categories.
What I need is an aggregation(?) - an array of distinct category_ids.
Also nice to have: the number of matching products per category_id.
Sample index data
{
"data": [
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4257,
"max_score": 1.0,
"hits": [
{
"_index": "index_name",
"_type": "product",
"_id": "4",
"_score": 1.0,
"_source": {
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4"
},
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
},
{
"category_id": 4,
"is_parent": true,
"is_virtual": "false",
"name": "Category name2"
},
{
"category_id": 7,
"is_parent": true,
"is_virtual": "false",
"name": "Category name3"
}
]
}
]
}
}
]
}
What I need is an array of category ids, sorted by product count, with the possibility to filter categories (by "is_virtual" or "is_parent"):
Expected result
{
"data": [
{
"categories": [
{
"category_id": 2,
"doc_count": 555
},
{
"category_id": 3,
"doc_count": 150
},
{
"category_id": 56,
"doc_count": 12
}
]
}
]
}
Index mapping
{
"index": {
"mappings": {
"product": {
"_all": {
"enabled": false
},
"properties": {
"EAN01": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN02": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN03": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN04": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN05": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN06": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN07": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN08": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN09": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN10": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN11": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN12": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN13": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN14": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN15": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN16": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN17": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN18": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN19": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"EAN20": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"activity": {
"type": "integer"
},
"attribute_set1_1": {
"type": "integer"
},
"attribute_set_id": {
"type": "integer"
},
"attributeset2attribute1": {
"type": "integer"
},
"attributeset3attribute1": {
"type": "integer"
},
"autocomplete": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"belongs_to_catalog": {
"type": "boolean"
},
"ca_1_1243545189": {
"type": "integer"
},
"category": {
"type": "nested",
"properties": {
"category_id": {
"type": "integer"
},
"is_blacklisted": {
"type": "boolean"
},
"is_parent": {
"type": "boolean"
},
"is_virtual": {
"type": "boolean"
},
"name": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"position": {
"type": "integer"
}
}
},
"category_gear": {
"type": "integer"
},
"children_attributes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"children_ids": {
"type": "integer"
},
"color": {
"type": "integer"
},
"configurable_attributes": {
"type": "keyword"
},
"configurable_variation": {
"type": "integer"
},
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
},
"custom_price": {
"type": "keyword",
"ignore_above": 256
},
"description": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"eco_collection": {
"type": "boolean"
},
"entity_id": {
"type": "integer"
},
"erin_recommends": {
"type": "boolean"
},
"features_bags": {
"type": "integer"
},
"gender": {
"type": "integer"
},
"has_options": {
"type": "boolean"
},
"image": {
"type": "keyword",
"ignore_above": 256
},
"indexed_attributes": {
"type": "keyword"
},
"manufacturer": {
"type": "integer"
},
"material": {
"type": "integer"
},
"mycolor": {
"type": "integer"
},
"mysize": {
"type": "integer"
},
"name": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"sortable": {
"type": "text",
"analyzer": "sortable",
"fielddata": true
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"new": {
"type": "boolean"
},
"option_text_activity": {
"type": "keyword",
"ignore_above": 256
},
"option_text_attribute_set1_1": {
"type": "keyword",
"ignore_above": 256
},
"option_text_attributeset2attribute1": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_attributeset3attribute1": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_belongs_to_catalog": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_ca_1_1243545189": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_category_gear": {
"type": "keyword",
"ignore_above": 256
},
"option_text_color": {
"type": "keyword",
"ignore_above": 256
},
"option_text_configurable_variation": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_eco_collection": {
"type": "keyword",
"ignore_above": 256
},
"option_text_erin_recommends": {
"type": "keyword",
"ignore_above": 256
},
"option_text_features_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_gender": {
"type": "keyword",
"ignore_above": 256
},
"option_text_manufacturer": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_material": {
"type": "keyword",
"ignore_above": 256
},
"option_text_mycolor": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_mysize": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_new": {
"type": "keyword",
"ignore_above": 256
},
"option_text_performance_fabric": {
"type": "keyword",
"ignore_above": 256
},
"option_text_sale": {
"type": "keyword",
"ignore_above": 256
},
"option_text_size": {
"type": "keyword",
"ignore_above": 256
},
"option_text_status": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"option_text_strap_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_style_bags": {
"type": "keyword",
"ignore_above": 256
},
"option_text_tax_class_id": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"performance_fabric": {
"type": "boolean"
},
"price": {
"type": "nested",
"properties": {
"customer_group_id": {
"type": "integer"
},
"final_price": {
"type": "double"
},
"is_discount": {
"type": "boolean"
},
"max_price": {
"type": "double"
},
"min_price": {
"type": "double"
},
"original_price": {
"type": "double"
},
"price": {
"type": "double"
},
"tax_class_id": {
"type": "integer"
}
}
},
"required_options": {
"type": "boolean"
},
"sale": {
"type": "boolean"
},
"search": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"search_query": {
"type": "nested",
"properties": {
"is_blacklisted": {
"type": "boolean"
},
"position": {
"type": "integer"
},
"query_id": {
"type": "integer"
}
}
},
"short_description": {
"type": "text",
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"size": {
"type": "integer"
},
"sku": {
"type": "text",
"fields": {
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"sortable": {
"type": "text",
"analyzer": "sortable",
"fielddata": true
},
"untouched": {
"type": "keyword",
"ignore_above": 256
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "reference"
},
"spelling": {
"type": "text",
"fields": {
"phonetic": {
"type": "text",
"analyzer": "phonetic"
},
"shingle": {
"type": "text",
"analyzer": "shingle"
},
"whitespace": {
"type": "text",
"analyzer": "whitespace"
}
},
"analyzer": "standard"
},
"status": {
"type": "integer"
},
"stock": {
"properties": {
"is_in_stock": {
"type": "boolean"
},
"qty": {
"type": "integer"
}
}
},
"strap_bags": {
"type": "integer"
},
"style_bags": {
"type": "integer"
},
"tax_class_id": {
"type": "integer"
},
"type_id": {
"type": "keyword"
},
"updated_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
},
"url_key": {
"type": "text",
"fields": {
"untouched": {
"type": "keyword",
"ignore_above": 256
}
},
"copy_to": [
"search",
"spelling"
],
"analyzer": "standard"
},
"visibility": {
"type": "integer"
}
}
}
}
}
}
Adding a working example with index data, mapping, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"category": {
"type": "nested"
}
}
}
}
Index Data:
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name3"
}
]
}
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
}
]
}
{
"entity_id": "4",
"attribute_set_id": "4",
"type_id": "simple",
"sku": "skuxxx",
"has_options": false,
"required_options": false,
"created_at": "2020-01-29 06:38:52",
"updated_at": "2020-09-04 12:31:00",
"visibility": "4",
"category": [
{
"category_id": 2,
"is_virtual": "false"
},
{
"category_id": 3,
"is_parent": true,
"is_virtual": "false",
"name": "Category name1"
},
{
"category_id": 4,
"is_parent": true,
"is_virtual": "false",
"name": "Category name2"
}
]
}
Search Query:
{
"size": 0,
"aggs": {
"nested_path": {
"nested": {
"path": "category"
},
"aggs": {
"distinct_categories": {
"terms": {
"field": "category.category_id"
},
"aggs": {
"top_category_hits": {
"top_hits": {
"_source": {
"includes": [
"category.category_id"
]
}
}
}
}
}
}
}
}
}
Search Result:
"aggregations": {
"nested_path": {
"doc_count": 6,
"distinct_categories": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 3, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "3",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "category",
"offset": 0
},
"_score": 1.0,
"_source": {
"category_id": 2 <-- note this
}
}
]
}
}
},
{
"key": 3,
"doc_count": 2, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 1
},
"_score": 1.0,
"_source": {
"category_id": 3
}
},
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "category",
"offset": 1
},
"_score": 1.0,
"_source": {
"category_id": 3 <-- note this
}
}
]
}
}
},
{
"key": 4,
"doc_count": 1, <-- note this
"top_category_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "stof_64259310",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "category",
"offset": 2
},
"_score": 1.0,
"_source": {
"category_id": 4 <-- note this
}
}
]
}
}
}
]
}
I am trying to sort a list on certain fields (firstName and lastName), but I have noticed some inconsistent results.
I am running a simple query
//Return all the employees from a specific company ordering by lastName asc | desc
GET employee-index-sorting
{
"query": {
"bool": {
"filter": {
"term": {
"companyId": 3179
}
}
}
},
"sort": [
{
"lastName.keyword": { <-- Should this be keyword? or not_analyzed
"order": "desc"
}
}
]
}
In the result why would van der Mescht and van Breda be before Zwane and Zwezwe?
I suspect there is something wrong with my mappings
{
"_index": "employee-index",
"_type": "_doc",
"_id": "637467",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name",
"lastName": "van der Mescht",
},
"sort": [
"van der Mescht"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "678335",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name3",
"lastName": "van Breda",
},
"sort": [
"van Breda"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "113896",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name2",
"lastName": "Zwezwe",
},
"sort": [
"Zwezwe"
]
},
{
"_index": "employee-index",
"_type": "_doc",
"_id": "639639",
"_score": null,
"_source": {
"companyId": 3179,
"firstName": "Name1",
"lastName": "Zwane",
},
"sort": [
"Zwane"
]
}
Mappings
Posting the entire mapping because I am not sure if there might be something else wrong with it.
How should I change the lastName and firstName properties to allow for sorting on them?
PUT employee-index-sorting
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
"keyword_analyzer": {
"filter": [
"lowercase",
"asciifolding",
"trim"
],
"char_filter": [],
"type": "custom",
"tokenizer": "keyword"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
},
"edge_ngram_search_analyzer": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 5,
"token_chars": [
"letter"
]
}
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"employeeId": {
"type": "keyword"
},
"companyGroupId": {
"type": "keyword"
},
"companyId": {
"type": "keyword"
},
"number": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"preferredName": {
"type": "text",
"index": false
},
"firstName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"middleName": {
"type": "text",
"index": false
},
"lastName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"fullName": {
"type": "text",
"fields": {
"keywordstring": {
"type": "text",
"analyzer": "keyword_analyzer"
},
"edgengram": {
"type": "text",
"analyzer": "edge_ngram_analyzer",
"search_analyzer": "edge_ngram_search_analyzer"
}
},
"analyzer": "standard"
},
"terminationDate": {
"type": "date"
},
"companyName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"idNumber": {
"type": "text"
},
"description": {
"type": "text",
"index": false
},
"jobNumber": {
"type": "keyword"
},
"frequencyId": {
"type": "long"
},
"frequencyCode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"frequencyAccess": {
"type": "boolean"
}
}
}
}
}
For sorting you need to use lastName.keyword, that's correct, no need to change anything there.
The reason why van der Mescht and van Breda come before Zwane and Zwezwe is that keyword strings are sorted lexicographically, i.e. basically by their position in the ASCII table, where uppercase characters come before lowercase ones. In asc mode, names starting with an uppercase letter would therefore sort before names starting with a lowercase letter; since you're sorting in desc mode, the order is exactly the opposite:
z...
...
van der Mescht
...
van Breda
...
a...
...
Zwezwe
...
Zwane
...
Z...
...
A...
To fix this, what you simply need to do is to add a normalizer to your lastName.keyword field, i.e. change your mapping to this and it will work:
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
...
},
"tokenizer": {
...
},
"normalizer": { <-- add this
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
}
},
"mappings": {
"_doc": {
"properties": {
...
"lastName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "lowersort", <-- add this
"ignore_above": 256
}
}
},
...
}
}
}
}
I am trying to make a query with a filter on my index but when I try to filter on any attribute in the mapping the query returns no result.
The query is the following; if I run it with just the geo_distance part, I get results. I would like to filter the results using one of the properties in the mapping (in this case rating, but it could be city, state, etc.).
Query is generated in Java via QueryBuilder from elasticsearch library (v 52.0). But for now I am trying to understand how to build a working query and executing via CURL.
{
"query": {
"bool": {
"filter": [
{
"geo_distance": {
"geometry.coordinates": [
12.3232,
12.2323
],
"distance": 200000,
"distance_type": "plane",
"validation_method": "STRICT",
"ignore_unmapped": false,
"boost": 1
}
},
{
"bool": {
"must": [
{
"terms": {
"rating": [
"0"
],
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
}
If I run a query filtering on zipcode or id it works.
For example a query like this:
{"query":{"bool":{"filter":{"term":{"zipCode":"111111"}}}}}
A snippet of my mapping is this
{
"my_index": {
"mappings": {
"poielement": {
"dynamic_templates": [
{
"suggestions": {
"match": "suggest_*",
"mapping": {
"analyzer": "my_analyzer",
"copy_to": "auto_suggest",
"search_analyzer": "my_analyzer",
"store": true,
"type": "text"
}
}
},
{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "text"
}
}
},
{
"geopoint": {
"match": "coordinates",
"mapping": {
"type": "geo_point"
}
}
},
{
"property": {
"match": "*",
"mapping": {
"analyzer": "my_analyzer",
"search_analyzer": "my_analyzer"
}
}
}
],
"date_detection": false,
"numeric_detection": false,
"properties": {
"city": {
"type": "text",
"analyzer": "my_analyzer"
},
"country": {
"type": "text",
"analyzer": "my_analyzer"
},
"geometry": {
"properties": {
"coordinates": {
"type": "geo_point"
},
"type": {
"type": "text",
"analyzer": "my_analyzer"
}
}
},
"id": {
"type": "text"
},
"name": {
"type": "keyword"
},
"rating": {
"type": "text"
},
"total_rate": {
"type": "text",
"analyzer": "my_analyzer"
},
"type": {
"type": "text",
"analyzer": "my_analyzer"
},
"zipCode": {
"type": "text"
}
}
}
}
}
}
When I retrieve data via http://elasticsearchpat/my_index/_search data looks like this
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 7517,
"max_score": 1,
"hits": [
{
"_index": "my_index",
"_type": "poielement",
"_id": "58768",
"_score": 1,
"_source": {
"zipCode": 111111,
"country": "USA",
"city": "Portland",
"rating": 0,
"type": "",
"id": 123,
"geometry": {
"coordinates": [
12.205061,
12.490463
],
"type": "Point"
}
}
}
]
}
}
I will be very grateful for any help.
Thanks
Try this query instead
{
"query": {
"bool": {
"must": [
{
"match": {
"rating": 0
}
}
],
"filter": [
{
"geo_distance": {
"geometry.coordinates": [
12.3232,
12.2323
],
"distance": 200000,
"distance_type": "plane",
"validation_method": "STRICT",
"ignore_unmapped": false,
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
}
I have an index which contains CustomerProfile documents. In each of these documents, the CustomerInsightTargets property (with the properties Source, Value) can be an array with x items. What I am trying to achieve is an autocomplete (of the top 5) on CustomerInsightTargets.Value grouped by CustomerInsightTargets.Source.
It would be helpful if anyone could give me a hint about how to select only a subset of nested objects from each document and use those nested objects in aggregations.
{
"customerinsights": {
"aliases": {},
"mappings": {
"customerprofile": {
"properties": {
"CreatedById": {
"type": "long"
},
"CreatedDateTime": {
"type": "date"
},
"CustomerInsightTargets": {
"type": "nested",
"properties": {
"CustomerInsightSource": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"CustomerInsightValue": {
"type": "text",
"term_vector": "yes",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "ngram_tokenizer_analyzer"
},
"CustomerProfileId": {
"type": "long"
},
"Guid": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Id": {
"type": "long"
}
}
},
"DisplayName": {
"type": "text",
"term_vector": "yes",
"analyzer": "ngram_tokenizer_analyzer"
},
"Email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Id": {
"type": "long"
},
"ImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "customerinsights",
"creation_date": "1484860145041",
"analysis": {
"analyzer": {
"ngram_tokenizer_analyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "10"
}
}
},
"number_of_replicas": "2",
"uuid": "nOyI0O2cTO2JOFvqIoE8JQ",
"version": {
"created": "5010199"
}
}
}
}
}
Having as example a document:
{
{
"Id": 9072856,
"CreatedDateTime": "2017-01-12T11:26:58.413Z",
"CreatedById": 9108469,
"DisplayName": "valentinos",
"Email": "valentinos#mail.com",
"CustomerInsightTargets": [
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "Tags",
"CustomerInsightValue": "Tag1",
"Guid": "00000000-0000-0000-0000-000000000000"
},
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "ProfileName",
"CustomerInsightValue": "valentinos",
"Guid": "00000000-0000-0000-0000-000000000000"
},
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "Playground",
"CustomerInsightValue": "Wiki",
"Guid": "00000000-0000-0000-0000-000000000000"
}
]
}
}
If I run a top_hits aggregation, the result will include all targets from a document if any one of them matches my search text.
Example
GET customerinsights/_search
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "CustomerInsightTargets",
"query": {
"bool": {
"must": [
{
"match": {
"CustomerInsightTargets.CustomerInsightValue": {
"query": "2017",
"operator": "AND",
"fuzziness": 2
}
}
}
]
}
}
}
}
]
}
} ,
"aggs": {
"root": {
"nested": {
"path": "CustomerInsightTargets"
},
"aggs": {
"top_tags": {
"terms": {
"field": "CustomerInsightTargets.CustomerInsightSource.keyword"
},
"aggs": {
"top_tag_hits": {
"top_hits": {
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"size": 5,
"_source": "CustomerInsightTargets"
}
}
}
}
}
}
},
"size": 0,
"_source": "CustomerInsightTargets"
}
My question is how I should use the aggregation to get the "autocomplete" Values grouped by Source and ordered by the _score. I tried to use a significant_terms aggregation, but it doesn't work so well; also, the terms aggs doesn't sort by score (and by _count), and having fuzzy matching also adds complexity.
I have my analyzers set like this:
"analyzer": {
"edgeNgram_autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "autocomplete"]
},
"full_name": {
"filter":["standard","lowercase","asciifolding"],
"type":"custom",
"tokenizer":"standard"
}
My filter:
"filter": {
"autocomplete": {
"type": "edgeNGram",
"side":"front",
"min_gram": 1,
"max_gram": 50
}
Name field analyzer:
"textbox": {
"_parent": {
"type": "document"
},
"properties": {
"text": {
"fields": {
"text": {
"type":"string",
"analyzer":"full_name"
},
"autocomplete": {
"type": "string",
"index_analyzer": "edgeNgram_autocomplete",
"search_analyzer": "full_name",
"analyzer": "full_name"
}
},
"type":"multi_field"
}
}
}
Put all together, makes up my mapping for docstore index:
PUT http://localhost:9200/docstore
{
"settings": {
"analysis": {
"analyzer": {
"edgeNgram_autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "autocomplete"]
},
"full_name": {
"filter":["standard","lowercase","asciifolding"],
"type":"custom",
"tokenizer":"standard"
}
},
"filter": {
"autocomplete": {
"type": "edgeNGram",
"side":"front",
"min_gram": 1,
"max_gram": 50
} }
}
},
"mappings": {
"space": {
"properties": {
"name": {
"type": "string",
"index": "not_analyzed"
}
}
},
"document": {
"_parent": {
"type": "space"
},
"properties": {
"name": {
"type": "string",
"index": "not_analyzed"
}
}
},
"textbox": {
"_parent": {
"type": "document"
},
"properties": {
"bbox": {
"type": "long"
},
"text": {
"fields": {
"text": {
"type":"string",
"analyzer":"full_name"
},
"autocomplete": {
"type": "string",
"index_analyzer": "edgeNgram_autocomplete",
"search_analyzer": "full_name",
"analyzer":"full_name"
}
},
"type":"multi_field"
}
}
},
"entity": {
"_parent": {
"type": "document"
},
"properties": {
"bbox": {
"type": "long"
},
"name": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
Add a space to hold all docs:
POST http://localhost:9200/docstore/space
{
"name": "Space 1"
}
When the user enters the word: proj
this should return all of the following texts:
SampleProject
Sample Project
Project Name
myProjectname
firstProjectName
my ProjectName
But it returns nothing.
My query:
POST http://localhost:9200/docstore/textbox/_search
{
"query": {
"match": {
"text": "proj"
}
},
"filter": {
"has_parent": {
"type": "document",
"query": {
"term": {
"name": "1-a1-1001.pdf"
}
}
}
}
}
If I search by project, I get:
{ "took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 3.0133555,
"hits": [
{
"_index": "docstore",
"_type": "textbox",
"_id": "AVRuV2d_f4y6IKuxK35g",
"_score": 3.0133555,
"_routing": "AVRuVvtLf4y6IKuxK33f",
"_parent": "AVRuV2cMf4y6IKuxK33g",
"_source": {
"bbox": [
8750,
5362,
9291,
5445
],
"text": [
"Sample Project"
]
}
},
{
"_index": "docstore",
"_type": "textbox",
"_id": "AVRuV2d_f4y6IKuxK35Y",
"_score": 2.4106843,
"_routing": "AVRuVvtLf4y6IKuxK33f",
"_parent": "AVRuV2cMf4y6IKuxK33g",
"_source": {
"bbox": [
8645,
5170,
9070,
5220
],
"text": [
"Project Name and Address"
]
}
}
]
}
}
Maybe my edgengram is not suited for this?
I am saying:
"side":"front"
Should I do it differently?
Does anyone know what I am doing wrong?
The problem is with the autocomplete indexing analyzer field name.
Change:
"index_analyzer": "edgeNgram_autocomplete"
To:
"analyzer": "edgeNgram_autocomplete"
And also search as @Andrei Stefan showed in his answer:
POST http://localhost:9200/docstore/textbox/_search
{
"query": {
"match": {
"text.autocomplete": "proj"
}
}
}
And it will work as expected!
I have tested your configuration on Elasticsearch 2.3
By the way, type multi_field is deprecated.
Hope I have managed to help :)
Your query should actually try to match on text.autocomplete and not text:
"query": {
"match": {
"text.autocomplete": "proj"
}
}