Elastic Search Won't Match For Arrays - elasticsearch

I'm trying to search a document with the following structure:
{
"_index": "XXX",
"_type": "business",
"_id": "1252809",
"_score": 1,
"_source": {
"url": "http://Samuraijapanese.com",
"raw_name": "Samurai Restaurant",
"categories": [
{
"name": "Cafe"
},
{
"name": "Cajun Restaurant"
},
{
"name": "Candy Stores"
}
],
"location": {
"lat": "32.9948649",
"lon": "-117.2528171"
},
"address": "979 Lomas Santa Fe Dr",
"zip": "92075",
"phone": "8584810032",
"short_name": "samurai-restaurant",
"name": "Samurai Restaurant",
"apt": "",
"state": "CA",
"stdhours": "",
"city": "Solana Beach",
"hours": "",
"yelp": "",
"twitter": "",
"closed": 0
}
}
Searching it for url, raw_name, address, etc. all works, but searching the categories returns nothing. I'm trying to search like so (if I switch anything else in for categories.name, it works):
"query": {
"filtered" : {
"filter" : {
"geo_distance" : {
"location" : {
"lon" : "-117.15726",
"lat" : "32.71533"
},
"distance" : "5mi"
}
},
"query" : {
"multi_match" : {
"query" : "Cafe",
"fields" : [
"categories.name"
]
}
}
}
},
"sort": [
{
"_score" : {
"order" : "desc"
}
},
{
"_geo_distance": {
"location": {
"lat": 32.71533,
"lon": -117.15726
},
"order": "asc",
"sort_mode": "min"
}
}
],
"script_fields": {
"distance_from_origin": {
"script": "doc['location'].arcDistanceInKm(32.71533,-117.15726)"
}
},
"fields": ["_source"],
"from": 0,
"size": 10
}
If I switch out, for example, categories.name with address, and change the search term to Lomas, it returns the result

Without seeing your type mapping I can't answer definitively, but I would guess you have mapped categories as nested. When querying sub-documents of type nested (as opposed to object) you have to use a nested query.

Related

Elasticsearch Normalised Score with Boost Documents

I am building a query that takes a set of codes and geo_point locations. The result should be a list of documents ordered by distance to the origin. However, I would like the score to be normalised — with, say, a score of 10 for a document at the origin location, decreasing according to distance from the origin. I have actually managed to build this search, but I would also like to increase the score of a document if it includes an additional variable in the list of codes.
These are the requirements:
The output should be a list of documents whose score is normalised according to distance from the origin.
Documents returned should contain at least one yvar (i.e. yvar1 OR yvar2 OR yvar3 OR yvar...).
Only documents after a certain date should be returned
Only documents containing all the xvars passed to the query must be returned.
If a document has an additional x variable (e.g xvar4) the score for this document, should be increased by 0.1. This is the bit I am struggling with.
This is my mapping:
{
"mappings": {
"properties": {
"codes": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "dd/MM/yyyy"
},
"coordinates": {"type": "geo_point"}
}
}
}
Some example documents (NB: The distanceToOrigin is for analysing the output only):
{ "create" : { "_index": "my-index", "_id" : "1" } }
{ "id": 1, "coordinates": { "lat": 51.5132, "lon": -0.1362}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 0 }
{ "create" : { "_index": "my-index", "_id" : "2" } }
{ "id": 2, "coordinates": { "lat": 52.9114, "lon": 0.5580}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "xvar4", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 114 }
{ "create" : { "_index": "my-index", "_id" : "3" } }
{ "id": 3, "coordinates": { "lat": 51.4890, "lon": -0.6029}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 22 }
{ "create" : { "_index": "my-index", "_id" : "4" } }
{ "id": 4, "coordinates": { "lat": 57.2555, "lon": -3.2692}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 530 }
My query which produces a normalised list of documents:
{
"query": {
"function_score": {
"query": { "match_all": {} },
"boost": "1",
"functions": [
{
"filter": [
{ "range": { "date":{ "gte": "01/11/2000" }}},
{ "terms_set": { "codes" : { "terms" : ["yvar1", "yvar2", "yvar3" ],
"minimum_should_match_script": { "source": "1" }}}}
],
"random_score": {},
"weight": 1
},
{
"filter": [
{ "terms_set": { "codes" : { "terms" : ["xvar1", "xvar2", "xvar3" ],
"minimum_should_match_script": { "source": "params.num_terms" }}}}
],
"weight": 1
},
{
"exp": {
"coordinates": {
"origin": "51.5132, -0.1362",
"offset": "0km",
"decay": 0.5,
"scale":"350km"}
},
"weight": 10
}
],
"max_boost": 10,
"score_mode": "max",
"boost_mode": "multiply"
}
}
}
This is what I tried as a query (substituting the match_all query) but does not work as I end up with a non-normalised list
"query": {
"bool": {
"should": [
{
"terms_set": { "codes" : { "terms" : ["xvar4"],
"minimum_should_match_script": { "source": "0" }, "boost" : 0.1}}
},
{
"match_all": {}
}
]
}
}
Any help for this Elasticsearch beginner will be greatly appreciated.
I found the solution by accessing the _score in a script_score query:
{
"query": {
"script_score": {
"query": {
"match": { "codes": "xvar4" }
},
"script": {
"source": "_score +0.1"
}
}
}
}

Should and Filter combination in ElasticSearch

I have this query which return the correct result
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
]
}
}
}
and the result is below:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 3.6012557,
"hits" : [
{
"_index" : "person",
"_type" : "_doc",
"_id" : "70002",
"_score" : 3.6012557,
"_source" : {
"gender" : "Male",
"activeStatus" : "Inactive",
"deceased" : "No",
"nameDetails" : {
"name" : [
{
"nameValue" : {
"firstName" : "Fawsu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Primary Name"
},
{
"nameValue" : {
"firstName" : "Fausu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Spelling Variation"
}
]
}
}
}
]
}
But when I add the filter for Gender, it returns no result
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
],
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
Even if I just use the filter, it returns no results:
GET /person/_search
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
You are not getting any search result, because you are using the term query (in the filter clause). Term query will return the document only if it has an exact match.
A standard analyzer is used when no analyzer is specified, which will tokenize Male to male. So either you can search for male instead of Male or use any of the below solutions.
If you have not defined any explicit index mapping, you need to add .keyword to the gender field. This targets the keyword sub-field, which stores the value unanalyzed, instead of the analyzed text field (notice the ".keyword" after the gender field). Try out this below query -
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender.keyword": "Male"
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "66879128",
"_type": "_doc",
"_id": "1",
"_score": 0.0,
"_source": {
"gender": "Male",
"activeStatus": "Inactive",
"deceased": "No",
"nameDetails": {
"name": [
{
"nameValue": {
"firstName": "Fawsu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Primary Name"
},
{
"nameValue": {
"firstName": "Fausu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Spelling Variation"
}
]
}
}
}
]
If you have defined index mapping, then modify the mapping for gender field as shown below
{
"mappings": {
"properties": {
"gender": {
"type": "keyword"
}
}
}
}

How to add a user defined field and value to an elasticsearch query

Goal: I want a query which adds a discriminator field to distinguish between fuzzy results and non-fuzzy results.
Consider these documents:
curl -X POST "localhost:9200/_bulk" -H 'Content-Type: application/json' -d'
{
"index": {
"_index": "dishes",
"_type": "dish",
"_id": "1"
}
}
{
"name": "butter chicken"
}
{
"index": {
"_index": "dishes",
"_type": "dish",
"_id": "2"
}
}
{
"name": "chicken burger"
}
'
Consider the following query:
curl -X POST "localhost:9200/dishes/_search?pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"bool": {
"should": [
{
"term": {
"name": "burger"
}
},
{
"fuzzy": {
"name": {
"value": "burger"
}
}
}
],
"minimum_should_match": 1,
"boost": 1.0
}
}
}
'
Can I have a result with an additional tag created during the query (it is not in the document) that can be used to discriminate between what is a fuzzy result and what is a non-fuzzy result?
...
"hits" : [
{
"_index" : "dishes",
"_type" : "dish",
"_id" : "2",
"_score" : 1.3862942,
"_source" : {
"name" : "chicken burger"
},
"is_fuzzy": false
},
{
"_index" : "dishes",
"_type" : "dish",
"_id" : "1",
"_score" : 0.46209806,
"_source" : {
"name" : "butter chicken"
},
"is_fuzzy": true
}
]
Scripted fields could have been ideal. But no luck yet.
I have a requirement to present the non-fuzzy results before fuzzy results. So sorting on is_fuzzy and then _score is guaranteed to work. (The actual query is more complex.)
sort: [
{
"is_fuzzy": {
"order": "desc"
}
},
{
"_score": {
"order": "desc"
}
}
One more option is to use named queries but your term filters will need to be slightly reworked:
GET dishes/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"name": {
"value": "burger",
"_name": "not_fuzzy"
}
}
},
{
"fuzzy": {
"name": {
"value": "burger",
"_name": "fuzzy"
}
}
}
],
"minimum_should_match": 1,
"boost": 1
}
}
}
yielding
[
{
"_index":"dishes",
"_type":"dish",
"_id":"2",
"_score":1.3862944,
"_source":{
"name":"chicken burger"
},
"matched_queries":[ <---
"fuzzy",
"not_fuzzy"
]
},
{
"_index":"dishes",
"_type":"dish",
"_id":"1",
"_score":0.46209806,
"_source":{
"name":"butter chicken"
},
"matched_queries":[ <---
"fuzzy"
]
}
]

Wrong indexation elasticsearch using the analyser

I did a pretty simple test. I build a student index and a type, then I define a mapping:
POST student
{
"mappings" : {
"ing3" : {
"properties" : {
"quote": {
"type": "string",
"analyzer": "english"
}
}
}
}
}
After that I add 3 students to this index:
POST /student/ing3/1
{
"name": "Smith",
"first_name" : "John",
"quote" : "Learning is so cool!!"
}
POST /student/ing3/2
{
"name": "Roosevelt",
"first_name" : "Franklin",
"quote" : "I learn everyday"
}
POST /student/ing3/3
{
"name": "Black",
"first_name" : "Mike",
"quote" : "I learned a lot at school"
}
At this point I thought that the english analyzer would tokenise all the words in my quotes, so if I make a search like:
GET /etudiant/ing3/_search
{
"query" : {
"term" : { "quote" : "learn" }
}
}
I would get all the documents as results, since the analyzer treats "learn", "learning" and "learned" as equal — and I was right. But when I try this request:
GET /student/ing3/_search
{
"query" : {
"term" : { "quote" : "learned" }
}
}
I got zero hits, and in my opinion I should get the 3rd document (at least?). As I understand it, Elasticsearch is supposed to index "learned" and "learning" as well, not only "learn". Am I wrong? Is my request wrong?
If you check:
GET 'index/_analyze?field=quote' -d "I learned a lot at school"
you will see that your sentence is analyzed as:
{
"tokens":[
{
"token":"i",
"start_offset":0,
"end_offset":1,
"type":"<ALPHANUM>",
"position":0
},
{
"token":"learn",
"start_offset":2,
"end_offset":9,
"type":"<ALPHANUM>",
"position":1
},
{
"token":"lot",
"start_offset":12,
"end_offset":15,
"type":"<ALPHANUM>",
"position":3
},
{
"token":"school",
"start_offset":19,
"end_offset":25,
"type":"<ALPHANUM>",
"position":5
}
]
}
So the english analyzer removes punctuation and stop words, and reduces words to their root (stemmed) form.
https://www.elastic.co/guide/en/elasticsearch/guide/current/using-language-analyzers.html
You can use match query which will also analyze your search text so will match:
GET /etudiant/ing3/_search
{
"query" : {
"match" : { "quote" : "learned" }
}
}
There is another way. You can both stem the terms (the english analyzer does include a stemmer) and keep the original terms, by using a keyword_repeat token filter and then a unique token filter with "only_on_same_position": true to remove unnecessary duplicates after the stemming:
PUT student
{
"settings": {
"analysis": {
"analyzer": {
"myAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"keyword_repeat",
"english_stemmer",
"unique_stem"
]
}
},
"filter": {
"unique_stem": {
"type": "unique",
"only_on_same_position": true
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
}
}
},
"mappings": {
"ing3": {
"properties": {
"quote": {
"type": "string",
"analyzer": "myAnalyzer"
}
}
}
}
}
In this case the term query will work, as well. If you look at what terms are actually being indexed:
GET /student/_search
{
"fielddata_fields": ["quote"]
}
it will be clear why now it matches:
"hits": [
{
"_index": "student",
"_type": "ing3",
"_id": "2",
"_score": 1,
"_source": {
"name": "Roosevelt",
"first_name": "Franklin",
"quote": "I learn everyday"
},
"fields": {
"quote": [
"everydai",
"everyday",
"i",
"learn"
]
}
},
{
"_index": "student",
"_type": "ing3",
"_id": "1",
"_score": 1,
"_source": {
"name": "Smith",
"first_name": "John",
"quote": "Learning is so cool!!"
},
"fields": {
"quote": [
"cool",
"learn",
"learning",
"so"
]
}
},
{
"_index": "student",
"_type": "ing3",
"_id": "3",
"_score": 1,
"_source": {
"name": "Black",
"first_name": "Mike",
"quote": "I learned a lot at school"
},
"fields": {
"quote": [
"i",
"learn",
"learned",
"lot",
"school"
]
}
}
]

Elasticsearch - MySQL Index Search Distance Search

I am trying to use Elasticsearch indexed on a MySQL table to find all addresses that are within x km from a particular data point. I have indexed the table with the following:
{
"type": "jdbc",
"jdbc": {
"strategy": "simple",
"url": "jdbc:mysql://hostname/databasename",
"user": "username",
"password": "password",
"sql": "SELECT name,address1,city,state,zip,lat as `location.lat`,lng as `location.lon` FROM addresses",
"poll": "24h",
"max_retries": 3,
"max_retries_wait": "10s",
"index" : "teststores",
"type" : "providers"
},
"index": {
"index": "addressindex",
"autocommit": "true",
"type": "mysql",
"bulk_size": 100,
"type_mapping": {
"location_mapping" : {
"properties" : {
"pin" : {
"type" : "geo_point"
}
}
}
}
}
}
An example of the indexed data is the following:
"_index": "teststores",
"_type": "providers",
"_id": "Rue2Yxo7SSa_mi5-AzRycA",
"_score": 1,
"_source": {
"zip": "10003",
"name": "I Salon",
"state": "NY",
"address1": "150 East 14th Street",
"location":{
"lat": 40.7337,
"lon": -73.9881
},
"city": "New York"
}
I want to adjust the following query to use lat and lng for calculating the distance.
{
"query": {
"filtered": {
"query": {
"match_all": {
}
},
"filter": {
"geo_distance" : {
"distance" : "2km",
"pin.location" : {
"lat" : 40.686511,
"lon" : -73.986574
}
}
}
}
},
}
How can I adjust this to make the distance work and get all addresses within x kilometers?

Resources