Elastic search top_hits aggregation on nested - elasticsearch

I have an index which contains CustomerProfile documents. In each of these documents, the CustomerInsightTargets property (whose items have the properties Source and Value) can be an array with x items. What I am trying to achieve is an autocomplete (of the top 5) on CustomerInsightTargets.Value grouped by CustomerInsightTargets.Source.
It would be helpful if anyone could give me a hint about how to select only a subset of nested objects from each document and use those nested objects in aggregations.
{
"customerinsights": {
"aliases": {},
"mappings": {
"customerprofile": {
"properties": {
"CreatedById": {
"type": "long"
},
"CreatedDateTime": {
"type": "date"
},
"CustomerInsightTargets": {
"type": "nested",
"properties": {
"CustomerInsightSource": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"CustomerInsightValue": {
"type": "text",
"term_vector": "yes",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "ngram_tokenizer_analyzer"
},
"CustomerProfileId": {
"type": "long"
},
"Guid": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Id": {
"type": "long"
}
}
},
"DisplayName": {
"type": "text",
"term_vector": "yes",
"analyzer": "ngram_tokenizer_analyzer"
},
"Email": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Id": {
"type": "long"
},
"ImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "customerinsights",
"creation_date": "1484860145041",
"analysis": {
"analyzer": {
"ngram_tokenizer_analyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "10"
}
}
},
"number_of_replicas": "2",
"uuid": "nOyI0O2cTO2JOFvqIoE8JQ",
"version": {
"created": "5010199"
}
}
}
}
}
Having as example a document:
{
{
"Id": 9072856,
"CreatedDateTime": "2017-01-12T11:26:58.413Z",
"CreatedById": 9108469,
"DisplayName": "valentinos",
"Email": "valentinos#mail.com",
"CustomerInsightTargets": [
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "Tags",
"CustomerInsightValue": "Tag1",
"Guid": "00000000-0000-0000-0000-000000000000"
},
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "ProfileName",
"CustomerInsightValue": "valentinos",
"Guid": "00000000-0000-0000-0000-000000000000"
},
{
"Id": 160,
"CustomerProfileId": 9072856,
"CustomerInsightSource": "Playground",
"CustomerInsightValue": "Wiki",
"Guid": "00000000-0000-0000-0000-000000000000"
}
]
}
}
If I run a top_hits aggregation, the result will include all targets from a document if any one of them matches my search text.
Example
GET customerinsights/_search
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "CustomerInsightTargets",
"query": {
"bool": {
"must": [
{
"match": {
"CustomerInsightTargets.CustomerInsightValue": {
"query": "2017",
"operator": "AND",
"fuzziness": 2
}
}
}
]
}
}
}
}
]
}
} ,
"aggs": {
"root": {
"nested": {
"path": "CustomerInsightTargets"
},
"aggs": {
"top_tags": {
"terms": {
"field": "CustomerInsightTargets.CustomerInsightSource.keyword"
},
"aggs": {
"top_tag_hits": {
"top_hits": {
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"size": 5,
"_source": "CustomerInsightTargets"
}
}
}
}
}
}
},
"size": 0,
"_source": "CustomerInsightTargets"
}
My question is how I should use the aggregation to get the "autocomplete" Values grouped by Source and ordered by the _score. I tried to use a significant_terms aggregation but it doesn't work so well; also, the terms aggregation doesn't sort by score (it sorts by _count), and having fuzziness also adds complexity.

Related

Get all the buckets for a aggregate elastic search

I want to get all the buckets available for a particular aggregation. Is there any query or endpoint to get the buckets?
Below is my mapping. If I query with any filter then only the related buckets come back, but I want all the buckets so that I can show them on the frontend and support OR operations.
Example: Suppose we have 2 records, one with category chair and the other with category table. If I select chair, the table count is returned as zero, but it should be shown as 1, so that the user can select both.
MyMapping:
{
"properties": {
"australiasellable": {
"type": "boolean"
},
"avgRating": {
"type": "float"
},
"categories": {
"type": "nested"
},
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"categorycode": {
"type": "text",
"fielddata": true
},
"categoryname": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"colour": {
"type": "text",
"fielddata": true
},
"commercialuse": {
"type": "boolean"
},
"customisable": {
"type": "boolean"
},
"depth": {
"type": "float"
},
"freedelivery": {
"type": "boolean"
},
"height": {
"type": "float"
},
"listprice": {
"type": "float"
},
"location": {
"type": "geo_point"
},
"material": {
"type": "text",
"fielddata": true
},
"materialcode": {
"type": "text",
"fielddata": true
},
"message": {
"type": "geo_point"
},
"numberOfRating": {
"type": "long"
},
"online": {
"type": "boolean"
},
"outdooruse": {
"type": "boolean"
},
"productid": {
"type": "long"
},
"productimageurl": {
"type": "text",
"fielddata": true
},
"productname": {
"type": "text",
"fielddata": true
},
"producttypecode": {
"type": "text",
"fielddata": true
},
"sellercode": {
"type": "text",
"fielddata": true
},
"sellerdescription": {
"type": "text",
"fielddata": true
},
"shortdescription": {
"type": "text",
"fielddata": true
},
"sku": {
"type": "text",
"fielddata": true
},
"state": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"stylecode": {
"type": "text",
"fielddata": true
},
"warrantycode": {
"type": "text",
"fielddata": true
},
"weight": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"width": {
"type": "float"
}
}
}
Regards,
Sreenivas
A possible solution would be not to set the filter in the query section of your payload but rather perform filtered aggregations and use the top_hits to get the _sources of the matched docs.
Long story short, if you apply a query, it'll of course affect your aggregations. So the trick is to not apply any query (either match_all or remove the whole query object) and perform the queries in the sub-aggregations as follows:
Using your category field:
GET your_index/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"actual_query_agg": {
"filter": {
"term": {
"category.keyword": {
"value": "chair"
}
}
},
"aggs": {
"actual_query_agg_top_hits": {
"top_hits": {
"_source": [
"category"
],
"size": 10
}
}
}
},
"excluding_my_query_filtered_agg": {
"filter": {
"bool": {
"must_not": {
"term": {
"category.keyword": "chair"
}
}
}
},
"aggs": {
"by_other_categories_agg": {
"terms": {
"field": "category.keyword",
"size": 10
},
"aggs": {
"categorized_other_docs_agg_top_hits": {
"top_hits": {
"_source": [
"category"
],
"size": 10
}
}
}
}
}
}
}
}
You can get rid of the top_hits sub-aggregations if you're just interested in the counts and not the underlying docs, i.e.:
GET your_index/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"actual_query_agg": {
"filter": {
"term": {
"category.keyword": {
"value": "chair"
}
}
}
},
"excluding_my_query_filtered_agg": {
"filter": {
"bool": {
"must_not": {
"term": {
"category.keyword": "chair"
}
}
}
},
"aggs": {
"by_other_categories_agg": {
"terms": {
"field": "category.keyword",
"size": 10
}
}
}
}
}
}

Elasticsearch query for multiple terms

I am trying to create a search query that allows to search by name and type.
I have indexed the values, and my record in Elasticsearch look like this:
{
_index: "assets",
_type: "asset",
_id: "eAOEN28BcFmQazI-nngR",
_score: 1,
_source: {
name: "test.png",
mediaType: "IMAGE",
meta: {
content-type: "image/png",
width: 3348,
height: 1890,
},
createdAt: "2019-12-24T10:47:15.727Z",
updatedAt: "2019-12-24T10:47:15.727Z",
}
}
So how would I create, for example, a query that finds all assets that have the name "test" and are images?
I tried a multi_match query but that did not return the correct results:
{
"query": {
"multi_match" : {
"query": "*test* IMAGE",
"type": "cross_fields",
"fields": [ "name", "mediaType" ],
"operator": "and"
}
}
}
The query above returns 0 results, and if I change the operator to "or" it returns all the assets of type IMAGE.
Any suggestions would be greatly appreciated. TIA!
EDIT: Added Mapping
Below is the mapping:
{
"assets": {
"aliases": {},
"mappings": {
"properties": {
"__v": {
"type": "long"
},
"createdAt": {
"type": "date"
},
"deleted": {
"type": "date"
},
"mediaType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"meta": {
"properties": {
"content-type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"width": {
"type": "long"
},
"height": {
"type": "long"
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"originalName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"updatedAt": {
"type": "date"
}
}
},
"settings": {
"index": {
"creation_date": "1575884312237",
"number_of_shards": "1",
"number_of_replicas": "1",
"uuid": "nSiAoIIwQJqXQRTyqw9CSA",
"version": {
"created": "7030099"
},
"provided_name": "assets"
}
}
}
}
You are unnecessarily using the wildcard expression for this simple query.
First, change your analyzer on the name field.
You need to create a custom analyzer which replaces . with a space, as the default standard analyzer doesn't do that, so that when searching for test you get test.png, as there will be both test and png in the inverted index. The main benefit of doing this is to avoid the regex queries, which are very costly.
Here is the updated mapping with a custom analyzer which does the work for you. Just update your mapping and re-index all the docs.
{
"aliases": {},
"mappings": {
"properties": {
"__v": {
"type": "long"
},
"createdAt": {
"type": "date"
},
"deleted": {
"type": "date"
},
"mediaType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"meta": {
"properties": {
"content-type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"width": {
"type": "long"
},
"height": {
"type": "long"
}
}
},
"name": {
"type": "text",
"analyzer" : "my_analyzer"
},
"originalName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"updatedAt": {
"type": "date"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"replace_dots"
]
}
},
"char_filter": {
"replace_dots": {
"type": "mapping",
"mappings": [
". => \\u0020"
]
}
}
},
"index": {
"number_of_shards": "1",
"number_of_replicas": "1"
}
}
}
Second, you should change your query to bool query as below:
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "test"
}
},
{
"match": {
"mediaType.keyword": "IMAGE"
}
}
]
}
}
}
Using must with 2 match queries means that it returns docs only when there is a match in all the clauses of the must query.
I already tested my solution by creating the index, inserting a few sample docs and query them, let me know if you need any help.
Did you try with best_fields?
{
"query": {
"multi_match" : {
"query": "Will Smith",
"type": "best_fields",
"fields": [ "name", "mediaType" ],
"operator": "and"
}
}
}

Query dynamic object type in ElasticSearch

Having the below field definition in a template.
"details": {
"dynamic": "true",
"type": "object"
}
This field is dynamically set based on the JSON schema, an example is the below.
"details": {
"responses": [
{
"questionKind": "multiple_choice",
"text": "Request Subscription",
"questionOrder": 1,
"order": 1
}
]
}
Is it possible to query ElasticSearch dynamic object datatype? I have tried the below but with no success.
{
"query": {
"bool": {
"must": [
{
"match": {
"details.responses.questionKind": "multiple_choice"
}
}
]
}
}
}
Full index template:
{
"index_patterns": "sponsorshipsinfluencers*",
"order": 3,
"version": 3,
"aliases": {
"sponsorshipsinfluencers": {}
},
"settings": {
"number_of_shards": 5,
"analysis": {
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"char_filter": [],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"dynamic": "true",
"properties": {
"id": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"influencerId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"campaignId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"campaignSponsorshipSetId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"campaignSponsorshipId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"acceptedDate": {
"type": "date"
},
"declinedDate": {
"type": "date"
},
"completedDate": {
"type": "date"
},
"paidDate": {
"type": "date"
},
"status": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"campaignType": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"createdTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"updatedTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"createdDate": {
"type": "date"
},
"updatedDate": {
"type": "date"
},
"details": {
"dynamic": "true",
"type": "object"
}
}
}
}
I would try without the details prefix, as it might be the document type name:
{
"query": {
"bool": {
"must": [
{
"match": {
"responses.questionKind": "multiple_choice"
}
}
]
}
}
}

Negative values in Elasticsearch range queries

I have found this problem while making a watch in Elasticsearch; this is my query:
"body": {
"query": {
"bool": {
"must": [
{
"range": {
"percent": {
"lt": 100
}
It successfully returns every document with percent between 0 and 99; however, it ignores those with a negative value. The "percent" field is mapped as a long in the index.
Can you help me?
Thanks
Edit: Return of executing "curl -XGET localhost:9200/monthly-tickets-2018-06"
{
"monthly-tickets-2018-06": {
"aliases": {},
"mappings": {
"monthly_tickets": {
"properties": {
"percent": {
"type": "long"
},
"priority": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"project": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ref": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"timestamp": {
"type": "date"
}
}
}
},
"settings": {
"index": {
"creation_date": "1528946562231",
"number_of_shards": "5",
"number_of_replicas": "1",
"uuid": "aIfLjFwqS_aCzQFvZm0L5Q",
"version": {
"created": "6020399"
},
"provided_name": "monthly-tickets-2018-06"
}
}
}
}

Must Match two different terms

I am looking to filter results where two sets of data match.
I get hits when I specify "should" but not "must".
My query, shown here, works as expected with just the one "match", but if I add a second I get no hits, yet there are definitely records in the index that have productSpecification.value of Brand and 3 Years:
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "control*",
"fields": [
"name^15",
"description^5",
"productCode"
]
}
}
]
}
}
"post_filter": {
"nested": {
"path": "productSpecification",
"query": {
"bool":{
"must": [
{
"match": {
"productSpecification.value":"3 years"
}
},
{
"match": {
"productSpecification.value":"Brand"
}
}
]
}
}
}
}
}
Just banging my head against the desk now trying different combinations of JSON trying to get this to return some values
{
"myindex": {
"mappings": {
"product": {
"properties": {
"description": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"dispatchTimeInDays": {
"type": "integer"
},
"height": {
"type": "integer"
},
"html": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"leadTimeInDays": {
"type": "integer"
},
"length": {
"type": "integer"
},
"limitedStock": {
"type": "boolean"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"notes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"price": {
"type": "double"
},
"productBrandId": {
"type": "integer"
},
"productCategory": {
"properties": {
"code": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"fullPath": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"parentProductCategoryId": {
"type": "integer"
},
"productCategoryId": {
"type": "integer"
}
}
},
"productCategoryId": {
"type": "integer"
},
"productCode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productId": {
"type": "integer"
},
"productImage": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productSpecification": {
"type": "nested",
"properties": {
"description": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "keyword"
},
"productId": {
"type": "long"
},
"productSpecificationId": {
"type": "long"
},
"specificationId": {
"type": "long"
},
"value": {
"type": "keyword"
}
}
},
"productTypeId": {
"type": "integer"
},
"reviewRating": {
"type": "double"
},
"reviewRatingCount": {
"type": "integer"
},
"sellingPriceGroupId": {
"type": "integer"
},
"stockAvailable": {
"type": "integer"
},
"taxRateId": {
"type": "integer"
},
"url": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"weightKg": {
"type": "double"
},
"width": {
"type": "integer"
}
}
}
}
}
}
Here is an example of a product I would expect to be returned with this query; it has a productSpecification.value of "3 years" AND a productSpecification.value of "Brand":
{
"_index": "myindex",
"_type": "product",
"_id": "uQEDbGEBfHre1rYmtsWB",
"_score": 141.5985,
"_source": {
"productId": 14587,
"name": "Brand Wave Multi Channel Remote Control",
"productCode": "111",
"productCategoryId": 17,
"length": 3,
"height": 0,
"productTypeId": 1,
"url": "brand-wave-multi-channel-remote-control",
"productBrandId": 3,
"width": 0,
"dispatchTimeInDays": 3,
"leadTimeInDays": 3,
"stockAvailable": 0,
"weightKg": 0.001,
"reviewRatingCount": 0,
"limitedStock": false,
"price": 63,
"productImage": "Wave-Remote-Control.jpg",
"productCategory": {
"productCategoryId": 17,
"name": "Accessories",
"fullPath": "Accessories",
"code": "00011"
},
"productSpecification": [{
"productSpecificationId": 852888,
"productId": 14587,
"specificationId": 232,
"name": "Brand",
"description": "This is the product manufacturer",
"value": "Brand"
},
{
"productSpecificationId": 852889,
"productId": 14587,
"specificationId": 92,
"name": "Type",
"value": "Remote control"
},
{
"productSpecificationId": 852891,
"productId": 14587,
"specificationId": 10,
"name": "Guarantee",
"value": "3 years"
},
{
"productSpecificationId": 852892,
"productId": 14587,
"specificationId": 599,
"name": "Power Voltage",
"value": "1.5 V"
},
{
"productSpecificationId": 852893,
"productId": 14587,
"specificationId": 29,
"name": "Dimensions",
"value": "157mm x 38mm x 19mm"
},
{
"productSpecificationId": 852894,
"productId": 14587,
"specificationId": 602,
"name": "Operation Range",
"value": "Up to 40m"
},
{
"productSpecificationId": 852895,
"productId": 14587,
"specificationId": 601,
"name": "Power Supply",
"value": "3V DC; 2 x AAA batteries"
}
]
}
}
After numerous amends my query is now like
{
"size": 100,
"aggs": {
"specifications": {
"nested": {
"path": "productSpecification"
},
"aggs": {
"groups": {
"terms": {
"field": "productSpecification.name"
},
"aggs": {
"attribute": {
"terms": {
"field": "productSpecification.value"
}
}
}
}
}
},
"price_range": {
"range": {
"field": "price",
"ranges": [
{
"to": 50
},
{
"from": 50,
"to": 100
},
{
"from": 100,
"to": 150
},
{
"from": 150,
"to": 200
},
{
"from": 200,
"to": 250
},
{
"from": 250
}
]
}
}
},
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "control*",
"fields": [
"name^15",
"description^5",
"productCode"
]
}
}
]
}
},
"post_filter": {
"query":{
"nested": {
"path": "productSpecification",
"query": {
"bool":{
"should": [{
"bool": {
"must":[{
"term": {
"productSpecification.name.keyword": "Brand"
}
},
{
"term": {
"productSpecification.value": "Brand"
}
}
]
}
},
{
"bool": {
"must": [
{
"term": {
"productSpecification.name.keyword": "Guarantee"
}
},
{
"term": {
"productSpecification.value": "3 years"
}
}
]
}
}
]
}
}
}
}
}
}
productSpecification.value is a keyword datatype. You should query against it with a term query instead of match. Also, you can't use must, because a single productSpecification entry that has Brand as its value can't also have 3 years as its value. In your case you should use should, because it is a logical OR operator:
{
"query": {
"nested": {
"path": "productSpecification",
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"term": {
"productSpecification.name.keyword": "Brand"
}
},
{
"term": {
"productSpecification.value": "Brand"
}
}
]
}
},
{
"bool": {
"must": [
{
"term": {
"productSpecification.name.keyword": "Guarantee"
}
},
{
"term": {
"productSpecification.value": "3 years"
}
}
]
}
}
]
}
}
}
}
}
Finally got this working after lots of experimentation / reading
posting here in case it is of use to others with similar problems
{
"post_filter": {
"bool": {
"filter": [{
"nested": {
"path": "productSpecification",
"query": {
"bool": {
"filter": [{
"term": {
"productSpecification.name": "Brand"
}
},
{
"terms": {
"productSpecification.value": [
"Brand1"
]
}
}
]
}
}
}
},
{
"nested": {
"path": "productSpecification",
"query": {
"bool": {
"filter": [{
"term": {
"productSpecification.name": "Guarantee"
}
},
{
"terms": {
"productSpecification.value": [
"3 years"
]
}
}
]
}
}
}
}
]
}
}
}

Resources