Query only those documents where image field is not empty - elasticsearch

I have the following mapping **(dynamic strict on the type)**
"created": {
"type": "date"
},
"images": {
"properties": {
"checksum": {
"type": "text",
"index": false
},
"path": {
"type": "text",
"index": false
},
"url": {
"type": "text",
"index": false
}
}
},
I want to query documents where there is an image present.
I tried a couple of combinations but have had no luck so far.
These are the last ones I tried:
POST catalog/_search
{
"query": {
"script": {
"script": "doc['images'].values.length > 0"
}
}
}
POST catalog/_search
{
"query": {
"script": {
"script": "doc['images.url'].values.length > 0"
}
}
}
But here it says that fielddata is disabled on text fields. Is there any way I can do this without changing my mapping?
Ideally this should give me all the records where there are no images, but it is returning all records:
POST catalog/_search
{
"query": {
"bool": {
"must_not": [
{
"exists": {
"field": "images"
}
}
]
}
}
}
Here is the example document in which there is a image.
{
"_index": "catalog-2018-03-03",
"_type": "product",
"_id": "151755703145e27e4983a0bd1b70be44",
"_score": 1,
"_source": {
"merchant": {
"link": "http://shophive.com/",
"name": "shophive"
},
"images": [],
"updated": "2018-03-18T13:06:33.583480",
"name": "Plantronics Savi Talk",
"created": "2018-03-18T13:06:33.583459",
"url": "http://www.shophive.com/plantronics-savi-talk",
"price": {
"new": 24999,
"old": 24999,
"discount_percent": 0
},
"category": {
"level_1": {
"url": "computers/tablets/networking",
"name": "Computers/Tablets & Networking "
},
"level_2": {
"url": "tablets/ebook-readers",
"name": "Tablets & eBook Readers"
}
}
}
}
Updated
With the below query I am expecting that elasticsearch would return the documents in which image is missing
POST catalog/product/_search
{
"query": {
"bool": {
"must_not": [
{
"exists": {
"field": "images"
}
}
]
}
}
}
But the result I receive is all the documents in my index, and apparently every document has one image. Here is an example document I get with the above query:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 20967,
"max_score": 1,
"hits": [
{
"_index": "catalog-2018-03-03",
"_type": "product",
"_id": "151755703145e27e4983a0bd1b70be44",
"_score": 1,
"_source": {
"merchant": {
"link": "http://shophive.com/",
"name": "shophive"
},
"images": [
{
"url": "http://www.shophive.com/media/catalog/product/cache/1/small_image/165x/9df78eab33525d08d6e5fb8d27136e95/p/l/plantronics_savi_talk.jpg",
"path": "full/8e3587bd2b6107f0beafa9b1ba05f476539be0a8.jpg",
"checksum": "fa74ade23c8e80e9590d48d4e59b6b64"
}
],
"updated": "2018-03-18T13:06:33.583480",
"name": "Plantronics Savi Talk",
"created": "2018-03-18T13:06:33.583459",
"url": "http://www.shophive.com/plantronics-savi-talk",
"price": {
"new": 24999,
"old": 24999,
"discount_percent": 0
},
"category": {
"level_1": {
"url": "computers/tablets/networking",
"name": "Computers/Tablets & Networking "
},
"level_2": {
"url": "tablets/ebook-readers",
"name": "Tablets & eBook Readers"
}
}
}
}
}
}

You should leave out the square brackets in the query, as you only have one clause:
POST /catalog/_search
{
"query": {
"bool": {
"must_not": {
"exists": {
"field": "images"
}
}
}
}
}
This returns the docs without images for me. If you need only those that have images, use:
POST /catalog/_search
{
"query": {
"exists": {
"field": "images"
}
}
}

Related

Elasticsearch - Nested field sorting

I have an index defined by the following :
{
"mappings": {
"properties": {
"firstName": {
"type": "keyword"
},
"lastName": {
"type": "keyword"
},
"affiliations": {
"type": "nested",
"properties": {
"organisation": {
"type": "keyword"
},
"team": {
"type": "keyword"
},
"dateBeginning": {
"type": "date",
"format": "yyyy-MM-dd"
},
"dateEnding": {
"type": "date",
"format": "yyyy-MM-dd"
},
"country": {
"type": "keyword"
}
}
}
}
}
}
Basically, for each researcher (researchers is how I named my index) I want to sort the affiliations by dateBeginning, in descending order. I've read about inner hits in the official Elasticsearch docs, and not being exactly sure how they work, I've tried this for the researcher with _id : 3 :
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
},
"inner_hits": {
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations",
"filter": {
"term": { "_id": 3 }
}
}
}
}
]
}
}
}
}
And it doesn't really work.
The researcher with _id : 3 has two affiliations, one with dateBeginning set to 2015-06-30 and the other to 2017-06-30. So I've also tried this:
{
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations"
}
}
}
],
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
}
}
}
}
And it doesn't sort the affiliations by dateBeginning.
I've also tried to do it with the SQL API (since I'm more familiar with SQL language), and still, I can't get the data I want.
So I'm quite new to ElasticSearch, I'm using version 7.10, and I don't know what else to do.
Any suggestions about what I'm doing wrong here ?
EDIT
here's an example of a document from that index:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [{
"_index": "researchers",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"firstName": "Kimmich",
"lastName": "Yoshua",
"affiliations": [{
"organisation": "University of Ottawa",
"team": "Neural Network Elite Team",
"dateBeginning": "2015-06-30",
"datEnding": "2017-01-31",
"country": "Canada"
},
{
"organisation": "University of Montréal",
"team": "Picture processing team",
"dateBeginning": "2017-06-30",
"dateEnding": null,
"country": "Canada"
}
]
}
}]
}
}
Once you're inside the nested query, the inner hits don't need the extra nested query. Remove it and the sort will work properly:
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": {
"_id": 3
}
},
"inner_hits": {
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc"
}
}
]
}
}
}
}
Note that this wouldn't sort the top-level hits -- only the inner hits.
But you can sort on the top level by the values of affiliations.dateBeginning like so:
POST researchers/_search
{
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc",
"nested_path": "affiliations"
}
}
]
}
but note that the syntax is now slightly different: instead of path we're saying nested_path.

How to return only the matched object in a nested field, instead of the whole object?

Elasticsearch version ( bin/elasticsearch --version ):
5.6.11
Plugins installed :
JVM version ( java -version ):
openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~16.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
OS version ( uname -a if on a Unix-like system):
Linux myserver 4.4.0-131-generic #157-Ubuntu SMP Thu Jul 12 15:51:36 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
I'm currently trying to avoid, if possible, having to create another index, and as a result I arrived at this question:
How do I retrieve only the matched element in a nested object instead of the whole object? in my case being the: my_nested_field.my_object.name that matches my criteria?
my mapping:
{
"my_super_special_index": {
"aliases": {
"my_super_special_index_alias": {}
},
"mappings": {
"my_super_special_index": {
"properties": {
"my_nested_field": {
"type": "nested",
"properties": {
"my_object": {
"properties": {
"id": {
"type": "integer"
},
"last_known_party": {
"type": "boolean"
},
"name": {
"type": "text",
"store": true,
"analyzer": "translation_index_analyzer",
"search_analyzer": "translation_search_analyzer"
},
"name_raw": {
"type": "keyword"
},
}
}
}
}
}
}
}
}
}
my query:
GET my_super_special_index/_search
{
"_source": "my_nested_field",
"query": {
"bool": {
"should": [
{
"nested": {
"path": "my_nested_field",
"query": {
"bool": {
"should": [
{
"match_phrase": {
"my_nested_field.my_object.name": {
"query": "my name"
}
}
},
{
"match": {
"my_nested_field.my_object.name": {
"query": "my name",
"boost": 100
}
}
}
]
}
}
}
}
],
"minimum_should_match": 1
}
},
"size": 50
}
What im getting as result :
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1337,
"hits": [
{
"_index": "my_super_special_index",
"_type": "my_super_special_index",
"_score": 2433.0676,
"_source": {
"id": "4712182",
"my_nested_fields": [
{
...
"my_object": [
{
"id": "22222",
"name": "name i want",
"name_raw": "name i want",
"last_known_party": true
},
{
"id": "13333",
"name": "hiyoo definitely doesnt match",
"name_raw": "hiyoo definitely doesnt match",
"last_known_party": true
}
],
"my_other_object": [
{
"id": "26672",
"name": "dont really like this",
"name_raw": "dont really like this",
"last_known_party": true
}
]
}
],
}
},
{
"_index": "my_super_special_index",
"_type": "my_super_special_index",
"_score": 2422.111,
"_source": {
"id": "357878",
"my_nested_fields": [
{
...
"my_object": [
{
"id": "661111",
"name": "ratatoille",
"name_raw": "ratatoille",
"last_known_party": true
},
{
"id": "2334",
"name": "name i want or close match",
"name_raw": "name i want or close match",
"last_known_party": true
}
],
"my_other_object": [
{
"id": "63111",
"name": "ttttt ok 1337",
"name_raw": "ttttt ok 1337",
"last_known_party": true
}
]
}
],
}
}
}
}
What I wish to get from ES:
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1337,
"hits": [
{
"_index": "my_super_special_index",
"_type": "my_super_special_index",
"_score": 2433.0676,
"_source": {
"id": "4712182",
"my_nested_fields": [
{
...
"my_object": [
{
"id": "22222",
"name": "name i want",
"name_raw": "name i want",
"last_known_party": true
}
]
}
],
}
},
{
"_index": "my_super_special_index",
"_type": "my_super_special_index",
"_score": 2422.111,
"_source": {
"id": "357878",
"my_nested_fields": [
{
...
"my_object": [
{
"id": "2334",
"name": "name i want or close match",
"name_raw": "name i want or close match",
"last_known_party": true
}
]
}
],
}
}
}
}
Thanks in advance!
Nested inner_hits to the rescue:
GET my_super_special_index/_search
{
"_source": "my_nested_field",
"query": {
"bool": {
"should": [
{
"nested": {
"path": "my_nested_field",
"inner_hits": {}, <--- add this
"query": {
"bool": {
"should": [
{
"match_phrase": {
"my_nested_field.my_object.name": {
"query": "my name"
}
}
},
{
"match": {
"my_nested_field.my_object.name": {
"query": "my name",
"boost": 100
}
}
}
]
}
}
}
}
],
"minimum_should_match": 1
}
},
"size": 50
}
Note: Since my_object is not nested as well, you'll still get the full array, but you'll only get the my_nested_fields object and not the my_other_object one.

elastic bool query must match not getting considered

i am basically trying to write a query where it should return the document where
school is "holy international" AND grade is "second".
but the issue with the current query is that it's not considering the must match query part, i.e. even though I don't specify the school, it is giving me this document, whereas it is not a match.
query is giving me all the documents where the grade is second.
i want only document where school is "holy international" AND grade is "second".
as well as i have not specified in the match query for "schools.school" but its giving me results.
mapping
{
"settings": {
"analysis": {
"analyzer": {
"my_keyword_lowercase1": {
"tokenizer": "keyword",
"filter": ["lowercase", "my_pattern_replace1", "trim"]
},
"my_keyword_lowercase2": {
"tokenizer": "standard",
"filter": ["lowercase", "trim"]
}
},
"filter": {
"my_pattern_replace1": {
"type": "pattern_replace",
"pattern": ".",
"replacement": ""
}
}
}
},
"mappings": {
"test_data": {
"properties": {
"schools": {
"type": "nested",
"properties": {
"school": {
"type": "string",
"analyzer": "my_keyword_lowercase1"
},
"grade": {
"type": "string",
"analyzer": "my_keyword_lowercase2"
}
}
}
}
}
}
}
data
{
"_index": "data_index",
"_type": "test_data",
"_id": "57a33ebc1d41",
"_version": 1,
"found": true,
"_source": {
"summary": null,
"schools": [{
"school": "little flower",
"grade": "first",
"date": "2007-06-01",
},
{
"school": "holy international",
"grade": "second",
"date": "2007-06-01",
},
],
"first_name": "Adam",
"location": "Kansas City",
"last_name": "Roger",
"country": "US",
"name": "Adam Roger",
}
}
query
{
"_source": ["first_name"],
"query": {
"nested": {
"path": "schools",
"inner_hits": {
"_source": {
"includes": [
"schools.school",
"schools.grade"
]
}
},
"query": {
"bool": {
"must": {
"match": {
"schools.school": {
"query": "" <-----X didnt specify anything
}
}
},
"filter": {
"match": {
"schools.grade": {
"query": "second",
"operator": "and",
"minimum_should_match": "100%"
}
}
}
}
}
}
}
}
result
{
"took": 26,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "data_test",
"_type": "test_data",
"_id": "57a33ebc1d41",
"_score": 0.2876821,
"_source": {
"first_name": "Adam"
},
"inner_hits": {
"schools": {
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_nested": {
"field": "schools",
"offset": 0
},
"_score": 0.2876821,
"_source": {
"schools": {
"school": "holy international",
"grade": "second"
}
}
}
]
}
}
}
}
]
}
}
So, basically your problem is the analysis step; when I loaded everything and checked, it became very clear:
This filter completely wipes the whole string from the schools.school field
"filter": {
"my_pattern_replace1": {
"type": "pattern_replace",
"pattern": ".",
"replacement": ""
}
}
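I think that's happening because . is a regex metacharacter that matches any single character, so every character gets replaced with the empty string (to match a literal dot, escape it as "\\." in the JSON settings). When I checked it: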
POST /_analyze
{
"field": "schools.school",
"text": "holy international"
}
{
"tokens": [
{
"token": "",
"start_offset": 0,
"end_offset": 18,
"type": "word",
"position": 0
}
]
}
That's why you always get a match, every string you passed during indexing time and during search time becomes "". Some additional info from Elastic wiki - https://www.elastic.co/guide/en/elasticsearch/reference/5.1/analysis-pattern_replace-tokenfilter.html
After I removed the pattern_replace filter, this query returns everything as expected:
{
"_source": ["first_name"],
"query": {
"nested": {
"path": "schools",
"inner_hits": {
"_source": {
"includes": [
"schools.school",
"schools.grade"
]
}
},
"query": {
"bool": {
"must": {
"match": {
"schools.school": {
"query": "holy international"
}
}
},
"filter": {
"match": {
"schools.grade": {
"query": "second"
}
}
}
}
}
}
}
}

Elastic Search: Aggregation sum on a particular field

I am new to elastic search and requesting some help.
Basically I have some 2 million documents in my elastic search and the documents look like below:
{
"_index": "flipkart",
"_type": "PSAD_ThirdParty",
"_id": "430001_MAM_2016-02-04",
"_version": 1,
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 206,
"bookedNumbers": 15000,
"ut": 1454567400000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
I want to write an aggregation query which satisfies below conditions:
1) First query based on "_index", "_type" and "pliId".
2) Do aggregation sum on metrics.value based on metrics.id = "Metric1".
Basically I need to query records based on some fields and aggregate sum on a particular metrics value based on metrics id.
Please can you help me in getting my query right.
Your metrics field needs to be of type nested:
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
}
}
}
If you want Metric1 to match, meaning upper-case letter, then as you see above the id needs to be not_analyzed.
Then, if you only want metrics.id = "Metric1" aggregations, you need something like this:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"pliId": 206
}
}
]
}
}
}
},
"aggs": {
"by_metrics": {
"nested": {
"path": "metrics"
},
"aggs": {
"metric1_only": {
"filter": {
"bool": {
"must": [
{
"term": {
"metrics.id": {
"value": "Metric1"
}
}
}
]
}
},
"aggs": {
"by_metric_id": {
"terms": {
"field": "metrics.id"
},
"aggs": {
"total_delivery": {
"sum": {
"field": "metrics.value"
}
}
}
}
}
}
}
}
}
}
Created new index:
Method : PUT ,
URL : http://localhost:9200/google/
Body:
{
"mappings": {
"PSAD_Primary": {
"properties": {
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
},
"value": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then I inserted some 200 thousand documents and then ran the query, and it worked.
Response:
{
"took": 34,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "google",
"_type": "PSAD_Primary",
"_id": "383701291_MAM_2016-01-06",
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 221244,
"bookedNumbers": 15000,
"ut": 1452061800000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
]
},
"aggregations": {
"by_metrics": {
"doc_count": 3,
"metric1_only": {
"doc_count": 1,
"by_metric_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Metric1",
"doc_count": 1,
"total_delivery": {
"value": 70
}
}
]
}
}
}
}
}

elasticsearch retrieving nested objects - not individual fields

When I use the "fields" option of a query I get a separate array for each field. Is it possible to get back the "complete" nested objects rather than just the field?
In the following example if I try to do "fields": ["cast"] it tells me that cast is not a leaf node. And if I do "fields": ["cast.firstName", "cast.middleName", "cast.lastName"] it returns 3 arrays.
Is there another way of retrieving just a partial amount of the document? Or is there a way to "reassemble" the separate fields into a complete "cast" object?
Example Index and Data:
POST /movies
{
"mappings": {
"movie": {
"properties": {
"cast": {
"type": "nested"
}
}
}
}
}
POST /movies/movie
{
"title": "The Matrix",
"cast": [
{
"firstName": "Keanu",
"lastName": "Reeves",
"address": {
"street": "somewhere",
"city": "LA"
}
},
{
"firstName": "Laurence",
"middleName": "John",
"lastName": "Fishburne",
"address": {
"street": "somewhere else",
"city": "NYC"
}
}
]
}
Example Query:
GET /movies/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "cast",
"filter": {
"bool": {
"must": [
{ "term": { "firstName": "laurence"} },
{ "term": { "lastName": "fishburne"} }
]
}
}
}
}
}
},
"fields": [
"cast.address.city",
"cast.firstName",
"cast.middleName",
"cast.lastName"
]
}
Result of example query:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "movies",
"_type": "movie",
"_id": "AU1JeyBseLgwMCOuOLsZ",
"_score": 1,
"fields": {
"cast.firstName": [
"Keanu",
"Laurence"
],
"cast.lastName": [
"Reeves",
"Fishburne"
],
"cast.address.city": [
"LA",
"NYC"
],
"cast.middleName": [
"John"
]
}
}
]
}
}
I think this is what you're looking for:
POST /movies/_search
{
"_source": {
"include": [
"cast.address.city",
"cast.firstName",
"cast.middleName",
"cast.lastName"
]
},
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "cast",
"filter": {
"bool": {
"must": [
{
"term": {
"firstName": "laurence"
}
},
{
"term": {
"lastName": "fishburne"
}
}
]
}
}
}
}
}
}
}
Result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "movies",
"_type": "movie",
"_id": "AU1PIJgBA_0Cyshym7-m",
"_score": 1,
"_source": {
"cast": [
{
"lastName": "Reeves",
"address": {
"city": "LA"
},
"firstName": "Keanu"
},
{
"middleName": "John",
"lastName": "Fishburne",
"address": {
"city": "NYC"
},
"firstName": "Laurence"
}
]
}
}
]
}
}
You can also choose to exclude fields instead of including or both, see documentation here: http://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html

Resources