Elasticsearch - Search with wildcards - elasticsearch

I've managed to populate my index with 4 documents using this bulk request:
POST localhost:9200/titles/movies/_bulk
{"index":{"_id":"1"}}
{"id": "1","level": "first","titles": [{"value": "The Bad and the Beautiful","type": "Catalogue","main": true},{"value": "The Bad and the Beautiful (1945)","type": "International","main": false}]}
{"index":{"_id":"2"}}
{"id": "2","level": "first","titles": [{"value": "Bad Day at Black Rock","type": "Drama","main": true}]}
{"index":{"_id":"3"}}
{"id": "3","level": "second","titles": [{"value": "Baker's Wife","type": "AnotherType","main": true},{"value": "Baker's Wife (1940)","type": "Trasmitted","main": false}]}
{"index":{"_id":"4"}}
{"id": "4","level": "second","titles": [{"value": "Bambi","type": "Educational","main": true},{"value": "The Baby Deer and the hunter (1942)","type": "Fantasy","main": false}]}
Now how can I perform searches with wildcards on all available titles?
Something like
localhost:9200/titles/movies/_search?q=*&sort=level:asc
but providing one or more wildcards. For instance, searching for "The % the %" and parsing the response from Elasticsearch to eventually return something like:
{
"count":2,
"results":[{
"id":"1",
"level":"first",
"foundInTitleTypes":["Catalogue","International"]
},{
"id":"4",
"level":"second",
"foundInTitleTypes":["Fantasy"]
}]
}
Thanks!

Elasticsearch provides regex support in the regular match query
GET titles/movies/_search
{
"query": {
"match" : { "titles.value" : "The * the *" }
}
}
Gives you this
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.6406528,
"hits": [
{
"_index": "titles",
"_type": "movies",
"_id": "4",
"_score": 1.6406528,
"_source": {
"id": "4",
"level": "second",
"titles": [
{
"value": "Bambi",
"type": "Educational",
"main": true
},
{
"value": "The Baby Deer and the hunter (1942)",
"type": "Fantasy",
"main": false
}
]
}
},
{
"_index": "titles",
"_type": "movies",
"_id": "1",
"_score": 0.9026783,
"_source": {
"id": "1",
"level": "first",
"titles": [
{
"value": "The Bad and the Beautiful",
"type": "Catalogue",
"main": true
},
{
"value": "The Bad and the Beautiful (1945)",
"type": "International",
"main": false
}
]
}
}
]
}
}
Update regarding your question about URI search: I'm not sure whether it is possible. If you do it with curl, you just pass the query DSL as data
curl localhost:9200/titles/movies/_search -d '{"query":{"match":{"titles.value":"The * the *"}}}'
{"took":46,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":2,"max_score":1.6406528,"hits":[{"_index":"titles","_type":"movies","_id":"4","_score":1.6406528,"_source":{"id": "4","level": "second","titles": [{"value": "Bambi","type": "Educational","main": true},{"value": "The Baby Deer and the hunter (1942)","type": "Fantasy","main": false}]}},{"_index":"titles","_type":"movies","_id":"1","_score":0.9026783,"_source":{"id": "1","level": "first","titles": [{"value": "The Bad and the Beautiful","type": "Catalogue","main": true},{"value": "The Bad and the Beautiful (1945)","type": "International","main": false}]}}]}}
Update to latest question:
Well if you want to sort by level, you need to provide a mapping for elasticsearch. What I did:
Delete index
DELETE titles
Add mapping
PUT titles
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"movies": {
"properties": {
"level": {
"type": "keyword"
}
}
}
}
}
Refine Query DSL
GET titles/movies/_search
{
"_source": [
"id",
"level",
"titles.value"
],
"sort": [
{
"level": {
"order": "asc"
}
}
],
"query": {
"match": {
"titles.value": "The * the *"
}
}
}
That gives me
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "titles",
"_type": "movies",
"_id": "1",
"_score": null,
"_source": {
"level": "first",
"id": "1",
"titles": [
{
"value": "The Bad and the Beautiful"
},
{
"value": "The Bad and the Beautiful (1945)"
}
]
},
"sort": [
"first"
]
},
{
"_index": "titles",
"_type": "movies",
"_id": "4",
"_score": null,
"_source": {
"level": "second",
"id": "4",
"titles": [
{
"value": "Bambi"
},
{
"value": "The Baby Deer and the hunter (1942)"
}
]
},
"sort": [
"second"
]
}
]
}
}

Related

Querying array with nested objects in Elasticsearch to get multiple objects

I have data in Elasticsearch in the below format -
"segments": [
{"id": "ABC", "value":123},
{"id": "PQR", "value":345},
{"id": "DEF", "value":567},
{"id": "XYZ", "value":789},
]
I want to retrieve all segments where id is "ABC" or "DEF".
I looked up the docs (https://www.elastic.co/guide/en/elasticsearch/reference/7.9/query-dsl-nested-query.html) and a few examples on YouTube, but they all seem to retrieve only a single object, while I want to retrieve more than one.
Is there a way to do this?
You can use nested query with inner hits as shown here.
I assume your index mapping looks like the one below and that the segments field is defined as nested
"mappings": {
"properties": {
"segments": {
"type": "nested",
"properties": {
"id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "long"
}
}
}
}
}
You can use below Query:
{
"_source" : false,
"query": {
"nested": {
"path": "segments",
"query": {
"terms": {
"segments.id.keyword": [
"ABC",
"DEF"
]
}
},
"inner_hits": {}
}
}
}
Response:
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "73895503",
"_id": "TmM8iYMBrWOLJcwdvQGG",
"_score": 1,
"inner_hits": {
"segments": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "73895503",
"_id": "TmM8iYMBrWOLJcwdvQGG",
"_nested": {
"field": "segments",
"offset": 0
},
"_score": 1,
"_source": {
"id": "ABC",
"value": 123
}
},
{
"_index": "73895503",
"_id": "TmM8iYMBrWOLJcwdvQGG",
"_nested": {
"field": "segments",
"offset": 2
},
"_score": 1,
"_source": {
"id": "DEF",
"value": 567
}
}
]
}
}
}
}
]
}

How to make aggregations work for text fields

I am trying to write an Elasticsearch query to get unique locality towns. My locality_town_keyword is of keyword type. When I try to search into locality_town_keyword, I get search hits but nothing in "aggregations":"Buckets".
Following is how my schema looks like...
"locality_town": {
"type": "text"
},
"locality_town_keyword": {
"type": "keyword"
},
My Search query looks like following
{
"query":
{
"prefix" : { "locality_town" : "m" }
},
"size": "1",
"_source": {
"includes": [
"locality_town*"
]
},
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword",
"size": 5,
"order": {
"_count": "desc"
}
}
}
}
}
Here is the output it gives
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 799,
"max_score": 1.0,
"hits": [
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_score": 1.0,
"_source": {
"locality_town": "Manchester",
"locality_town_keyword": "Manchester"
}
}
]
},
"aggregations": {
"loc": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
This is how one document looks like
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_version": 1,
"_seq_no": 39,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Legal Services",
"buyers": "CENTRAL MANCHESTER UNIVERSITY HOSPITALS NHS FOUNDATION TRUST",
"postal_code": "M13 0JR",
"publish_date": "2015-03-03T15:48:45Z",
"status": "cancelled",
"start_date": "2017-03-03T00:00:00Z",
"endt_date": "2020-03-03T00:00:00Z",
"url": "https://www.temp.com",
"country": "England",
"description": "desc......",
"language": "en-GB",
"service": "OPEN_CONTRACTING",
"value": "0",
"value_currency": "GBP",
"winner": "",
"create_time": "2019-05-11T21:39:42Z",
"deadline_date": "1970-01-01T00:00:00Z",
"address": "Central Manchester University Hospitals NHS Foundation Trust Wilmslow Park",
"locality_town": "Manchester",
"locality_town_keyword": "Manchester",
"region": "North West",
"tender_type": "planning",
"cpv": "Health services ",
"strpublish_date": "2015-03-03T15:48:45Z",
"strstart_date": "2017-03-03T00:00:00Z",
"strend_date": "2020-03-03T00:00:00Z",
"strdeadline_date": "",
"winner_email": "",
"winner_address": "",
"winner_town": "",
"winner_postalcode": "",
"winner_phone": "",
"cpvs": "[\"Health services (85100000-0)\"]"
}
}
Looks like you have a typo in your aggregation query:
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword", <== here
"size": 5,
Try with locality_town_keyword instead!
Hope this helps!

Elasticsearch: search score puzzle me. Same score for different match levels

To simplify:
PUT /test/vendors/1
{
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
PUT /test/vendors/2
{
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
PUT /test/vendors/3
{
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
Then search:
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ]
}
}
}
I understand why I get Jack who works in San Fran -- it's because he's a doctor too. However, I can't figure out why the match score is the SAME for him. The other two were matched with the place too, weren't they? why aren't Ron and Tom scored higher?
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.9245277,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
Is there a way to force it to score lower when fewer search keywords are found? Also, if I'm going the wrong way about this kind of search and there's a better pattern/way to do it -- I'd appreciate being pointed in the right direction.
Your search structure is incorrect. The search query above is ignoring the place property and that's why you get the same score for all documents (only type property is taken into account). The reason for that is because works_at is a nested mapping, which should be treated differently when searching.
First, you should define works_at as a nested mapping (read more here). Then you'll have to adjust your query to work with that nested mapping, see an example here.
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ],
"type": "most_fields" . <---- I WAS MISSING THIS
}
}
}
once in, that gave the correct results, where the "San Fran" guy is scored lower.
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.2122098,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}

How can I fetch all distinct objects within a field over an elasticsearch index?

I am trying to get all distinct objects from within a field over the whole index.
What I tried so far is:
POST http://es5server:9200/indexname/_search
Content-Type: application/json
{
"size": "1",
"aggs": {
"tags": {
"terms": {
"field": "tags"
}
}
}
}
Currently this returns the following for me (I set size to 1 to include a sample document):
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.0,
"hits": [
{
"_index": "indexname",
"_type": "news",
"_id": "51",
"_score": 1.0,
"_source": {
"localized": {
"de": {
"title": null,
"shorttext": null,
"text": null
},
"en": {
"title": "test new title",
"shorttext": "hello my name is mayur and this is testnews text",
"text": null
}
},
"type": "object",
"key": "testnews",
"path": "\/",
"tags": {
"38": {
"name": "I AM",
"parent": "0"
},
"45": {
"name": "ffddd",
"parent": "43"
},
"43": {
"name": "kkjjttdd",
"parent": "0"
}
}
}
}
]
},
"aggregations": {
"tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
The buckets are empty, is this because the tags field contains objects and not text?
How can I get ES to return all distinct objects within the tags fields of all documents?
Looks like you need to have nested objects for your solution. See this question answered here

Find all ID where ID are not in my blacklist

After a lot of reading, I cannot say whether this kind of query is possible with Elasticsearch. I found the "getting started" really excellent, but the rest of the guide lacks examples (from my point of view).
See my structure below, I need to retrieve all id who are not in my blacklist. My blacklist is some reference id. For this example I am the id 1 with the firstname "me" . Here in the structure we see I blacklisted "bob" , so the bob id (2) is in my blacklist array because I don't want to find bob in my search result.. :)
Is it possible to only retrieve (dynamically for sure) all id who are not in my blacklist in one query?
If you come from SQL, the same logic could be :
SELECT id FROM index WHERE id NOT IN (SELECT * FROM blacklist WHERE id = 1)
I would like to avoid a two-step query. If my schema is bad and should be reconsidered, I'm totally open to advice or suggestions.
Here is the structure :
{
"id: 1,
"balance": 16623,
"firstname": "me",
"blacklist" : [2,1982,939,1982,98716,7611,983838, and thousands others ....],
}
{
"id: 2,
"balance": 16623,
"firstname": "bob,
"blacklist" : [18,1982,939,1982,98716,7611,983838, and thousands others ....],
}
{
"id: 3,
"balance": 16623,
"firstname": "jhon",
"blacklist" : [18,1982,939,1982,98716,7611,983838, and thousands others ....],
}
You can use a terms filter lookup together with a not filter as follows.
I set up the index with the three docs you have listed:
DELETE /test_index
PUT /test_index
PUT /test_index/doc/1
{
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist" : [2,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/2
{
"id": 2,
"balance": 16623,
"firstname": "bob",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/3
{
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
Then set up a query that filters out docs that are in the blacklist for "me":
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist": [2,1982,939,1982,98716,7611,983838]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
If you also want to filter out the user whose blacklist is being used, you can set up a slightly more complex filter using or:
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"or": {
"filters": [
{
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
},
{
"term": {
"id": "1"
}
}
]
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/0b6808414f9447d4f7d23eb4c0d3e937ec2ea4e7

Resources