How to search multiple fields and aggregate scores? - elasticsearch

I'm trying to figure out a solution to how I should structure my queries for finding answers to people's questions. For example, based off the dataset I will paste at the end of this post, I would like to query "Shows about romance", and maybe get results like so:
{
"hits": [
{
"_score": "31",
"_source": {
"anime": "Grisaia no Kajitsu"
}
},
{
"_score": "12",
"_source": {
"anime": "Mirai Nikki"
}
},
{
"_score": "7",
"_source": {
"anime": "Bakemonogatari"
}
}
]
}
Grisaia no Kajitsu shows up as the first result because it's shown in multiple relevant questions, and Mirai Nikki is second because it had a higher score than Bakemonogatari.
Basically, I would like answers ranked by relevance based on the question, score, and tags fields. Answers that are repeated across multiple questions should have a higher score. Any suggestions?
My dataset:
[
{
"question": "Looking for romance anime",
"score": 4,
"answers": [
{
"anime": "Mirai Nikki",
"score": 8,
"tags": ["action", "adventure", "death game", "romance"]
},
{
"anime": "Bakemonogatari",
"score": 3,
"tags": ["action", "comedy", "romance", "seinen"]
}
]
},
{
"question": "Survival Anime",
"score": 10,
"answers": [
{
"anime": "Grisaia no Kajitsu",
"score": 4,
"tags": ["school", "drama", "survival", "romance"]
},
{
"anime": "Kanata no Astra",
"score": 7,
"tags": ["action", "comedy", "drama", "space"]
}
]
},
{
"question": "Horror and romance anime?",
"score": 12,
"answers": [
{
"anime": "Grisaia no Kajitsu",
"score": 15,
"tags": ["school", "drama", "survival", "romance"]
}
]
}
]

This should work for you. You can tune the various boost parameters here and see how they affect your results:
{
"_source": ["answers.anime"],
"query": {
"bool": {
"should": [
{
"term": {
"answers.tags": {
"value": "Shows about romance",
"boost": 2 //weight of tags field
}
}
},
{
"match": {
"question": {
"query": "Shows about romance",
"boost": 2 //weight of question field
}
}
},
{
"function_score": {
"min_score": 0.9,
"functions": [
{
"field_value_factor": {
"factor": 1, //weight of score field
"field": "answers.score",
"modifier": "log2p"
}
}
]
}
}
]
}
}
}

Related

Cannot seem to use must and must_not together in an elastic search query

If I run the following query:
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "boxing",
"fuzziness": 2,
"minimum_should_match": 2
}
}
],
"must_not": [
{
"terms_set": {
"allowedCountries": {
"terms": ["gb", "mx"],
"minimum_should_match_script": {
"source": "2"
}
}
}
}
],
"filter": [
{
"range": {
"expireTime": {
"gt": 1674061907954
}
}
},
{
"term": {
"region": {
"value": "row"
}
}
},
{
"term": {
"sourceType": {
"value": "article"
}
}
}
]
}
}
}
against an index with articles that look like:
{
"_index": "content-items-v10",
"_type": "_doc",
"_id": "e7hm75ui4dma1mm4j8q5v7914",
"_score": 4.3724976,
"_source": {
"allowedCountries": ["gb", "ie"],
"body": "Both Joshua Buatsi and Craig Richards join The DAZN Boxing Show ahead of their clash at London's O2 Arena. Matchroom's Eddie Hearn also gives his take on the night, as well as Chantelle Cameron previewing her contest with Victoria Noelia Bustos.",
"competitions": [
{
"id": "8lo6205qyio0fksjx9glqbdhj",
"name": "Buatsi v Richards"
}
],
"contestants": [
{
"id": "7rq59j3eiamxlm12vhxcsgujj",
"name": "Joshua Buatsi"
},
{
"id": "boby9oqe23g6qyuwphrxh8su5",
"name": "Craig Richards"
}
],
"countries": [
{
"id": "7yasa43laq1nb2e6f8bfuvxed",
"name": "World"
},
{
"id": "258l9t5sm55592i08mdpqzr3t",
"name": "United Kingdom"
}
],
"dotsLastUpdateTime": 1673979749396,
"expireTime": 4800000000000,
"fixtureDate": {},
"headline": "Buatsi vs. Richards: Preview",
"id": "e7hm75ui4dma1mm4j8q5v7914",
"importance": 0,
"languageKeys": ["en"],
"languages": ["en"],
"lastUpdateTime": {
"ts": 1653088281000,
"iso8601": "2022-05-20T23:11:21.000Z"
},
"promoImageUrl": null,
"publication": {
"typeId": "1plcw0iyhx9vn1fcanbm2ja3rf",
"typeName": "Shoulder"
},
"publishedTime": {
"ts": 1653088281000,
"iso8601": "2022-05-20T23:11:21.000Z"
},
"region": "row",
"shortHeadline": null,
"sourceType": "article",
"sports": [
{
"id": "2x2oqzx60orpoeugkd754ga17",
"name": "Boxing"
}
],
"teaser": "",
"thumbnailImageUrl": "https://images.daznservices.com/di/library/babcock_canada/45/3e/the-dazn-boxing-show-20052022_xc4jbfqi022l1shq9lu641h9e.png?t=-477976832",
"translations": {}
}
}
I get the following validation error from elasticsearch:
{
"ok": false,
"errors": {
"validation": [
{
"message": "\"query.bool.must_not\" is not allowed",
"path": [
"query",
"bool",
"must_not"
],
"type": "object.unknown",
"context": {
"child": "must_not",
"label": "query.bool.must_not",
"value": [
{
"terms_set": {
"allowedCountries": {
"terms": [
"gb",
"mx"
],
"minimum_should_match_script": {
"source": "2"
}
}
}
}
],
"key": "must_not"
}
}
]
},
"correlationId": "d29e9275-9ab3-4ff8-944d-852b98d4b503"
}
And I cannot figure out what the issue might be! From the elastic docs it should be OK.
I'm using ElasticSearch 7.9.3 running in a local docker container.
I'm hoping someone out there will give me a clue!
Cheers!
I would expect this to just work.
I'm trying to filter out articles that have both of the country codes gb and mx in the field allowedCountries.
I can include them easily enough in the results when I add the terms_set query to the bool.must section of the query.
It works well; you just need to enclose your query in the query section:
{
"query": { <--- add this
"bool": { <--- your query starts here
"must": [
...
Thank you for responding!
I was helping with a system I did not have full context on - it turns out there is a proxy in the mix with validation that was blocking the must_not query. So, with the proxy fixed, it now works.

How to combine simplequerystring with bool/must

I have this ElasticSearch query for ES version 7:
{
"from": 0,
"simple_query_string": {
"query": "*"
},
"query": {
"bool": {
"must": [
{
"term": {
"organization_id": "fred"
}
},
{
"term": {
"assigned_user_id": "24584080"
}
}
]
}
},
"size": 50,
"sort": {
"updated": "desc"
},
"terminate_after": 50,
}
but ES gives me back this error:
reason: Unknown key for a START_OBJECT in [simple_query_string]
My goal is to be able to use a query string for multiple fields, and also use term/match with bool/must. Should I abandon the query string and just use bool.must[{match:"my query"}]?
You can use bool to combine multiple queries in this way. The must clause will work as logical AND, and will make sure all the conditions are matched.
You need to include the simple_query_string inside the query section
Adding a working example with sample docs and a search query.
Index Sample Data
{
"organization_id": 1,
"assigned_user_id": 2,
"title": "welcome"
}{
"organization_id": 2,
"assigned_user_id": 21,
"title": "hello"
}{
"organization_id": 3,
"assigned_user_id": 22,
"title": "hello welocome"
}
Search Query :
{
"query": {
"bool": {
"must": [
{
"simple_query_string": {
"fields" : ["title"],
"query" : "welcome"
}
},
{
"match": {
"organization_id": "1"
}
},
{
"match": {
"assigned_user_id": "2"
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "my_index",
"_type": "_doc",
"_id": "1",
"_score": 3.0925694,
"_source": {
"organization_id": 1,
"assigned_user_id": 2,
"title": "welcome"
}
}
]

Elasticsearch - How does one combine term suggestions from multiple fields?

The term suggester documentation lays out the basics of term suggester, but it leaves me wondering how I can find suggestions from multiple fields and combine them. I can probably come up with some implementation after-the-fact, but I'm wondering if there are some settings I'm missing.
For example, let's say I want to get suggestions from three different fields
GET product-search-product/_search
{
"suggest": {
"text": "som typu here",
"my-suggest-1": {
"term": {
"size": 1,
"max_edits": 1,
"prefix_length": 3,
"field": "field_one"
}
},
"my-suggest-2": {
"term": {
"size": 1,
"max_edits": 1,
"prefix_length": 3,
"field": "field_two"
}
},
"my-suggest-3": {
"term": {
"size": 1,
"max_edits": 1,
"prefix_length": 3,
"field": "field_three"
}
}
}
}
This returns results I can use, but I have to figure out which field had the "best" suggestion.
"suggest": {
"my-suggest-1": [
{
"text": "som",
...
"options": [
{
"text": "somi"
...
}
]
},
{
"text": "typu",
...
"options": [
{
"text": "typo"
...
}
]
},
{
"text": "here",
...
"options": []
}
],
"my-suggest-2": [
{
"text": "som",
...
"options": [
{
"text": "some"
...
}
]
},
{
"text": "typu",
...
"options": []
},
{
"text": "here",
...
"options": []
}
],
"my-suggest-3": [
{
"text": "som",
...
"options": []
},
{
"text": "typu",
...
"options": [
{
"text": "typa"
...
}
]
},
{
"text": "here",
...
"options": []
}
]
}
It looks to me as if I have to implement something to determine which field came up with the best suggestions. Is there no way to combine these in the suggester so it can do that for me?
The phrase suggester turned out to be appropriate for my case: it supports candidate generators, which can be defined per field, and these appear to solve my problem.

Fuzzy search in the elasticsearch gives matches with incorrect order

I am trying to build an engine where we can match areas mentioned in the address with the list available in elasticsearch.
I am using this query to search areas similar to "iit".
My query is :
{
"query": {
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}
I am getting the results below:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 2.1290483,
"hits": [
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRiZ04pEQsZFpK6A_",
"_score": 2.1290483,
"_source": {
"locality": [
"emerald isle ii"
]
},
"highlight": {
"locality": [
"emerald isle <em>ii</em>"
]
}
},
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRfof4pEQsZFpK59H",
"_score": 1.877402,
"_source": {
"locality": [
"iit - bombay",
"iitb",
"indian institute of technology - bombay"
]
},
"highlight": {
"locality": [
"<em>iit</em> - bombay",
"<em>iitb</em>"
]
}
}
]
}
}
Because "iit" is directly available in the 2nd document, I was expecting that document to be returned as the best match with the highest score. What change should I make so that the 2nd document gets the highest score?
I am using ES 2.3.4.
If you also want exact matches to score better, I always suggest using a bool query with should clauses and adding a match or term query alongside the fuzzy one. This way, the combined scores will favor the exact match:
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
{
"match": {
"locality": "iit"
}
}
]
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}

How to get total score specific to each row

I need an Elasticsearch GET query to view the total score of each student, obtained by adding up the marks they earned in all subjects. Instead, I am getting the total score of all the students across every subject.
GET /testindex/testindex/_search
{
"query" : {
"filtered" : {
"query" : {
"match_all" : {}
}
}
},
"aggs": {
"total": {
"sum": {
"script" : "doc['physics'].value + doc['maths'].value + doc['chemistry'].value"
}
}
}
}
Output
{
....
"hits": [
{
"_index": "testindex",
"_type": "testindex",
"_id": "1",
"_score": 1,
"_source": {
"personalDetails": {
"name": "viswa",
"age": "33"
},
"marks": {
"physics": 18,
"maths": 5,
"chemistry": 34
},
"remarks": [
"hard working",
"intelligent"
]
}
},
{
"_index": "testindex",
"_type": "testindex",
"_id": "2",
"_score": 1,
"_source": {
"personalDetails": {
"name": "bob",
"age": "13"
},
"marks": {
"physics": 48,
"maths": 45,
"chemistry": 44
},
"remarks": [
"hard working",
"intelligent"
]
}
}
]
},
"aggregations": {
"total": {
"value": 194
}
}
}
Expected Output:
I would like to get the total marks earned across all subjects for each individual student, rather than the total for all the students combined.
What changes do I need to make to the query to achieve this?
{
"query": {
"filtered": {
"query": {
"match_all": {}
}
}
},
"aggs": {
"student": {
"terms": {
"field": "personalDetails.name",
"size": 10
},
"aggs": {
"total": {
"sum": {
"script": "doc['physics'].value + doc['maths'].value + doc['chemistry'].value"
}
}
}
}
}
}
But be careful: for the student terms aggregation you need a field that uniquely identifies each student (a personal ID or something similar). It could be the _id itself, but then you need to store it.

Resources