Elasticsearch: getting the latest of each document by ID and language

My company has a CMS and the visits to its content are logged to Elasticsearch; each entry is a document like this:
{
"datetime": "2020-08-02T17:12:54.012+01:00",
"title" : "title of the content",
"LocalNumber" : "000025",
"CodeLanguage" : "eng"
}
There are more fields, but these are the important ones. The title property changes over time, meaning the ID (the LocalNumber) stays the same but the title may differ.
I'm trying to write an ES query that retrieves just the latest document per LocalNumber and CodeLanguage. I managed to do it by LocalNumber alone, using field collapsing:
{
"query" : {
"match_all" : {}
},
"collapse": {
"field": "LocalNumber.keyword"
},
"sort": [{"datetime": "desc"}]
}
How should I change this query to be able to get it by LocalNumber and CodeLanguage?

It looks like you can just use a second level of collapsing inside collapse:
{
"query": {
"match_all": {}
},
"collapse": {
"field": "LocalNumber.keyword",
"inner_hits": {
"name": "by lang",
"collapse": {
"field": "CodeLanguage.keyword"
},
"size": 3
}
},
"sort": [
{
"datetime": "desc"
}
]
}
This will return:
1 document per LocalNumber.keyword, most recent by datetime first (your original query);
for each such document, up to 3 inner_hits, one per CodeLanguage.keyword value.
With my test data it returns a result like the following:
{
"hits": {
"hits": [
{
"_index": "myindex2",
"_type": "_doc",
"_id": "2",
"_score": null,
"_source": {
"datetime": "2020-08-02T17:12:54.012+01:00",
"title": "title of the content 2",
"LocalNumber": "000025",
"CodeLanguage": "fr"
},
"fields": {
"LocalNumber.keyword": [
"000025"
]
},
"sort": [
1596384774012
],
"inner_hits": {
"by lang": {
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "myindex2",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"datetime": "2020-08-02T17:12:54.012+01:00",
"title": "title of the content 2",
"LocalNumber": "000025",
"CodeLanguage": "fr"
},
"fields": {
"CodeLanguage.keyword": [
"fr"
]
}
},
{
"_index": "myindex2",
"_type": "_doc",
"_id": "10",
"_score": 1.0,
"_source": {
"datetime": "2020-08-01T17:12:54.012+01:00",
"title": "title of the content 10",
"LocalNumber": "000025",
"CodeLanguage": "eng"
},
"fields": {
"CodeLanguage.keyword": [
"eng"
]
}
}
]
}
}
}
}
]
}
}
I believe this is just syntactic sugar for normal Elasticsearch aggregations, like the aggregation-based answer joe posted. Under the hood ES will be doing the same thing, but this query is smaller and probably easier to understand.
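For reference, a rough sketch of what the equivalent explicit aggregation might look like (field names come from the question, the index name from my test data, and the bucket sizes are assumptions you'd tune):
POST myindex2/_search
{
  "size": 0,
  "aggs": {
    "by_local_number": {
      "terms": { "field": "LocalNumber.keyword", "size": 1000 },
      "aggs": {
        "by_language": {
          "terms": { "field": "CodeLanguage.keyword", "size": 10 },
          "aggs": {
            "latest": {
              "top_hits": {
                "size": 1,
                "sort": [ { "datetime": { "order": "desc" } } ]
              }
            }
          }
        }
      }
    }
  }
}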

You can also aggregate by those two fields separately and then use top_hits to get only the latest doc:
{
"size": 0,
"aggs": {
"by_LocalNumber": {
"terms": {
"field": "LocalNumber.keyword"
},
"aggs": {
"latest": {
"top_hits": {
"size": 1,
"sort": {
"datetime": {
"order": "desc"
}
}
}
}
}
},
"by_CodeLanguage": {
"terms": {
"field": "CodeLanguage.keyword"
},
"aggs": {
"latest": {
"top_hits": {
"size": 1,
"sort": {
"datetime": {
"order": "desc"
}
}
}
}
}
}
}
}
And if you're only interested in a subset of LocalNumber and CodeLanguage values, you can constrain them in a should query, as sketched below.
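A sketch of that constraint, where the two term values are just placeholders for the ones you care about (only one of the two aggregations is repeated here for brevity):
{
  "size": 0,
  "query": {
    "bool": {
      "should": [
        { "term": { "LocalNumber.keyword": "000025" } },
        { "term": { "CodeLanguage.keyword": "eng" } }
      ],
      "minimum_should_match": 1
    }
  },
  "aggs": {
    "by_LocalNumber": {
      "terms": { "field": "LocalNumber.keyword" },
      "aggs": {
        "latest": {
          "top_hits": {
            "size": 1,
            "sort": { "datetime": { "order": "desc" } }
          }
        }
      }
    }
  }
}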

Related

Elasticsearch - Is it possible to collapse first then aggregate data of a nested field?

I am using Elasticsearch and I want to group our results by a specific field, returning the top n documents per group. The documents have a nested field, and I want to aggregate that nested field across the documents of each group.
Example
I have 5 documents, each with a groupId and a nested field people. I want to group these documents by groupId, and then for each group I want to get the top 2 people (some documents may contain the same people).
PUT test/_mapping
{
"properties": {
"groupId":{
"type":"keyword"
},
"id":{
"type":"keyword"
},
"name":{
"type":"text"
},
"people":{
"type":"nested",
"properties":{
"email":{
"type":"keyword"
}
}
}
}
}
PUT test/_doc/1
{
"name": "docs1",
"groupId": "1",
"people":[{
"email":"people1#test.com"
}]
}
PUT test/_doc/2
{
"name": "docs2",
"groupId": "1",
"people":[{
"email":"people2.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/3
{
"name": "docs3",
"groupId": "2",
"people":[{
"email":"people3.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/4
{
"name": "docs4",
"groupId": "1",
"people":[{
"email":"people4.1#test.com"
},
{
"email":"people4.2#test.com"
}]
}
PUT test/_doc/5
{
"name": "docs5",
"groupId": "3",
"people":[{
"email":"people5.1#test.com"
},
{
"email":"people5.2#test.com"
}]
}
Search query
GET test/_search
{
"collapse": {
"field": "groupId",
"inner_hits": {
"name":"inner",
"size": 2
}
},
"sort": [
{
"groupId": {
"order": "asc"
}
}
],
"size": 2,
"from": 0
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": null,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
},
"fields": {
"groupId": [
"1"
]
},
"sort": [
"1"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": 0,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
}
},
{
"_index": "test",
"_id": "2",
"_score": 0,
"_source": {
"name": "docs2",
"groupId": "1",
"people": [
{
"email": "people2.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
},
{
"_index": "test",
"_id": "3",
"_score": null,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
},
"fields": {
"groupId": [
"2"
]
},
"sort": [
"2"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "3",
"_score": 0,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
}
]
}
}
What I'm expecting is to aggregate a groupPeople field for each group containing the top n people of that group (it should not be affected by the inner_hits size; e.g. for groupId=1 there are 3 documents and 5 people).
The query that you're looking for is this one:
POST test/_search
{
"size": 0,
"aggs": {
"groups": {
"terms": {
"field": "groupId",
"size": 10
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}
If you need pagination, you can achieve the same using the composite aggregation:
POST test/_search
{
"size": 0,
"aggs": {
"pages": {
"composite": {
"sources": [
{
"groups": {
"terms": {
"field": "groupId"
}
}
}
]
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}

How can I search and return only nested documents in ElasticSearch?

Say I have the following author documents, each with book documents:
[
{
"name": "Foo McBarrington",
"books": [
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
},
{
"name": "Bar McFooington",
"books": [
{
"title": "Baz Book",
"published": "2020-06-23"
}
]
}
]
I would like to search for and return books, ignoring anything from the author documents, and sort the books by each book's published field. I should be able to get the books sorted relative to each other:
[
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Baz Book",
"published": "2020-06-23"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
Notice that the book from the second author is sorted in the middle of the two books from the first author.
Is this possible with Elasticsearch? So far I've tried using the top_hits aggregation inside a nested aggregation, but I'm not sure why it's not working.
Yes, it is possible. You can use a combination of a nested aggregation, a terms aggregation, and a top_hits aggregation to achieve your result:
{
"size": 0,
"aggs": {
"resellers": {
"nested": {
"path": "books"
},
"aggs": {
"books": {
"terms": {
"field": "books.title.keyword"
},
"aggs": {
"latest_books": {
"top_hits": {
"sort": [
{
"books.published": {
"order": "asc"
}
}
],
"_source": {
"includes": [
"books.title",
"books.published"
]
},
"size": 1
}
}
}
}
}
}
}
}
The search result will be
"aggregations": {
"resellers": {
"doc_count": 3,
"books": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Bar Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 1
},
"_score": null,
"_source": {
"published": "2021-06-13", // note this
"title": "Bar Book"
},
"sort": [
1623542400000
]
}
]
}
}
},
{
"key": "Baz Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2020-06-23", // note this
"title": "Baz Book"
},
"sort": [
1592870400000
]
}
]
}
}
},
{
"key": "Foo Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2019-05-02", // note this
"title": "Foo Book"
},
"sort": [
1556755200000
]
}
]
}
}
}
]
}
}
}
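One note: the nested path "books" used above requires books to be mapped as a nested field. A minimal mapping sketch, using the index name from the result above (the exact field types are an assumption):
PUT 68477157
{
  "mappings": {
    "properties": {
      "name": { "type": "text" },
      "books": {
        "type": "nested",
        "properties": {
          "title": {
            "type": "text",
            "fields": { "keyword": { "type": "keyword" } }
          },
          "published": { "type": "date" }
        }
      }
    }
  }
}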
I found a way to get the books in a paginated format using a composite aggregation:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}
To get the following page, specify after:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"after": {
"published": "<DATE HERE>",
"order": "desc"
},
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}
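The value to put in after comes from the previous page of buckets; recent Elasticsearch versions return it directly as after_key in the composite aggregation response.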

Search all unique terms from a given query in Elasticsearch

I am trying to search for all the unique names in the index test_nested.
GET test_nested/_mappings
{
"test_nested": {
"mappings": {
"my_type": {
"properties": {
"group": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"user": {
"type": "nested",
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
GET test_nested/_search
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test_nested",
"_type": "my_type",
"_id": "AWG5iVBz4bQsVnslc9gL",
"_score": 1,
"_source": {
"group": "fans",
"user": [
{
"name": "Linux"
},
{
"name": "Android (operating system)"
},
{
"name": "Widows 10"
}
]
}
},
{
"_index": "test_nested",
"_type": "my_type",
"_id": "AWG5ieKW4bQsVnslc9gM",
"_score": 1,
"_source": {
"group": "fans",
"user": [
{
"name": "Bitcoin"
},
{
"name": "PHP"
},
{
"name": "Microsoft Windows"
}
]
}
},
{
"_index": "test_nested",
"_type": "my_type",
"_id": "AWG5irrV4bQsVnslc9gN",
"_score": 1,
"_source": {
"group": "fans",
"user": [
{
"name": "Windows XP"
}
]
}
},
{
"_index": "test_nested",
"_type": "my_type",
"_id": "1",
"_score": 1,
"_source": {
"group": "fans",
"user": [
{
"name": "iOS"
},
{
"name": "Android (operating system)"
},
{
"name": "Widows 10"
},
{
"name": "Widows XP"
}
]
}
}
]
}
}
I want all the unique names matching a term, i.e. if I search for "wi*" then I should get [Microsoft Windows, Widows 10, Windows XP].
I don't know exactly what you mean, but I use this query to list all statuses:
GET order/default/_search
{
"size": 0,
"aggs": {
"status_terms": {
"terms": {
"field": "status.keyword",
"missing": "N/A",
"min_doc_count": 0,
"order": {
"_key": "asc"
}
}
}
}
}
My model has a status field and that query lists all statuses.
This is a bucket aggregation (a terms aggregation).
One of the fields in the result is sum_other_doc_count: Elasticsearch returns only the top unique terms, so if you have many different terms, some of them will not appear in the results. This field is the sum of the document counts that are not part of the response.
For nested objects, try to read and use the nested query and nested aggregation docs.
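For the mapping in the question specifically, a sketch of a nested aggregation on user with a terms sub-aggregation on user.name.keyword could look like this (the include regex and size are assumptions to adjust):
GET test_nested/_search
{
  "size": 0,
  "aggs": {
    "users": {
      "nested": { "path": "user" },
      "aggs": {
        "unique_names": {
          "terms": {
            "field": "user.name.keyword",
            "include": ".*[Ww]i.*",
            "size": 10000
          }
        }
      }
    }
  }
}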
I found the solution. Hope it helps someone.
GET record_new/_search
{
"size": 0,
"query": {
"term": {
"software_tags": {
"value": "windows"
}
}
},
"aggs": {
"software_tags": {
"terms": {
"field": "software_tags.keyword",
"include" : ".*Windows.*",
"size": 10000,
"order": {
"_count": "desc"
}
}
}
}
}

How do I perform an "OR" filter on an aggregate?

I am trying to grab the first 10 documents grouped by domain. These 10 documents need to have a "crawl_date" value showing they haven't been crawled for a while, or haven't been crawled at all (e.g. a blank value). I have:
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '
{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"or":[
"term": {"crawl_date": ""},
"term": {"crawl_date": ""} // how do I put a range here? e.g. <= '2014-12-31'
]
},
"terms": {
"field": "domain"
}
}
}
}'
I am new to ES and using version 2.2. Since the documentation isn't fully updated I am struggling.
EDIT:
To clarify, I need 10 urls that haven't been crawled or haven't been crawled for a while. Each of those 10 urls has to come from a unique domain so that when I crawl them I don't overload someone's server.
Another Edit:
So, I need something like this (1 link for each of 10 unique domains):
1. www.domain1.com/page
2. www.domain2.com/url
etc...
Instead, I am getting just the domain and the number of pages:
"buckets": [
{
"key": "http://www.dailymail.co.uk",
"doc_count": 212
},
{
"key": "https://sedo.com",
"doc_count": 196
},
{
"key": "http://www.foxnews.com",
"doc_count": 118
},
{
"key": "http://data.worldbank.org",
"doc_count": 117
},
{
"key": "http://detail.1688.com",
"doc_count": 117
},
{
"key": "https://twitter.com",
"doc_count": 112
},
{
"key": "http://search.rakuten.co.jp",
"doc_count": 104
},
{
"key": "https://in.1688.com",
"doc_count": 92
},
{
"key": "http://www.abc.net.au",
"doc_count": 87
},
{
"key": "http://sport.lemonde.fr",
"doc_count": 85
}
]
The "hits" returns multiple pages for just 1 domain:
"hits": [
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/w/at-the-edge-of-the-orchard-tracy-chevalier/1121908441?ean=9780525953005",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
etc....
Barnes and Noble will quickly ban my UA if I try to crawl that many of their pages at the same time.
I need something like this:
1. "http://www.dailymail.co.uk/page/text.html",
2. "https://sedo.com/another/page"
3. "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb"
4. "http://www.starbucks.com/homepage/"
etc.
Using Aggregations
If you want to use aggregations, I'd suggest using a terms aggregation to remove the duplicates from your result set and, as a sub-aggregation, a top_hits aggregation, which gives you the best hit from the aggregated documents of each domain (by default the score for each document within a domain should be the same).
Consequently the query will look like this:
POST sites/page/_search
{
"size": 0,
"aggs": {
"filtered_domains": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "domain",
"size": 10
},
"aggs": {
"pages": {
"top_hits": {
"size": 1
}
}
}
}
}
}
}
}
Giving you results like this:
"aggregations": {
"filtered_domains": {
"doc_count": 3,
"domains": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "barnesandnoble.com",
"doc_count": 2,
"pages": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "barnesandnoble.com/test2.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "barnesandnoble.com"
}
}
]
}
}
},
{
"key": "starbucks.com",
"doc_count": 1,
"pages": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "starbucks.com"
}
}
]
}
}
}
]
}
}
}
Using Parent/Child Documents
If you can change your index structure, I'd suggest creating an index with either a parent/child relationship or nested documents.
If you do so, you can select 10 distinct domains and retrieve one (or more) specific pages for each of them.
Let me show you an example with parent/child (if you use Sense, you should be able to just copy-paste):
First generate the mappings for the documents:
PUT /sites
{
"mappings": {
"domain": {},
"page": {
"_parent": {
"type": "domain"
},
"properties": {
"crawl_date": {
"type": "date"
}
}
}
}
}
Insert some documents
PUT sites/domain/barnesandnoble.com
{}
PUT sites/domain/starbucks.com
{}
PUT sites/domain/dailymail.co.uk
{}
POST /sites/page/_bulk
{ "index": { "_id": "barnesandnoble.com/test.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "barnesandnoble.com/test2.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "starbucks.com/index.html", "parent": "starbucks.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "dailymail.co.uk/index.html", "parent": "dailymail.co.uk" }}
{}
Search for the urls to crawl
POST /sites/domain/_search
{
"query": {
"has_child": {
"type": "page",
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}]
}
}
}
},
"inner_hits": {
"size": 1
}
}
}
}
We do a has_child query on the parent type and therefore receive only distinct URLs of the parent type. To get the specific pages, we have to add inner_hits, which gives us the child documents that lead to the hits on the parent type.
If you set inner_hits size to 1, you get only one page per domain.
You can even add sorting inside the inner_hits, for example by crawl_date (see the sketch after the results below). ;)
The above search gives you the following result:
"hits": [
{
"_index": "sites",
"_type": "domain",
"_id": "starbucks.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1.9664046,
"_routing": "starbucks.com",
"_parent": "starbucks.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "dailymail.co.uk",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "dailymail.co.uk/index.html",
"_score": 1.9664046,
"_routing": "dailymail.co.uk",
"_parent": "dailymail.co.uk",
"_source": {}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "barnesandnoble.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 2,
"max_score": 1.4142135,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "barnesandnoble.com/test.html",
"_score": 1.4142135,
"_routing": "barnesandnoble.com",
"_parent": "barnesandnoble.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
}
]
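As a sketch of that sorting, here is the same query with inner_hits sorted ascending by crawl_date; the "missing": "_first" part is my assumption, so that pages with no crawl_date at all come first:
POST /sites/domain/_search
{
  "query": {
    "has_child": {
      "type": "page",
      "query": {
        "bool": {
          "filter": {
            "bool": {
              "should": [
                { "bool": { "must_not": { "exists": { "field": "crawl_date" } } } },
                { "range": { "crawl_date": { "lte": "2016-01-01" } } }
              ]
            }
          }
        }
      },
      "inner_hits": {
        "size": 1,
        "sort": [
          { "crawl_date": { "order": "asc", "missing": "_first" } }
        ]
      }
    }
  }
}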
Finally, let me note one thing: the parent/child relationship comes with a small cost at query time. If this isn't a problem for your use case, I'd go for this solution.
I suggest you use the exists filter instead of trying to match an empty term (the missing filter is deprecated in 2.2). Then, the range filter will help you filter out the documents you don't need.
Finally, since you have used the absolute URL as the id, make sure to aggregate on the _uid field and not the domain field; that way you'll get unique counts per exact page.
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2014-12-31T00:00:00.000"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "_uid"
}
}
}
}
}
}'
You have to use a filter aggregation and then a sub-aggregation:
{
"size": 10,
"aggs": {
"filter_date": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "crawl_date"
}
}
]
}
},
{
"range": {
"crawl_date": {
"from": "now-100d"
}
}
}
]
}
},
"aggs": {
"group_by_domain": {
"terms": {
"field": "domain"
}
}
}
}
}
}

Boosting along with a prefix query on _all

I want to be able to run a prefix query on EACH of the search terms across any field, and I would like to have highlighting. I formulated a query which seems to work. Now I want to update the query so that matches in one of the fields yield a higher score than matches in the other fields.
For example I index the following data (this is just a sample, in my real data there are many more fields than just the two):
PUT /my_index/my_type/abc124
{
"title" : "blah",
"description" : "golf"
}
PUT /my_index/my_type/abc123
{
"title" : "blah golf",
"description" : "course"
}
PUT /my_index/my_type/abc125
{
"title" : "blah golf tee",
"description" : "course"
}
Then I can query as mentioned with a query like:
POST my_index/my_type/_search
{
"query": {
"bool": {
"must": [
{
"prefix": {
"_all" : "gol"
}
},
{
"prefix": {
"_all": "bla"
}
}
]
}
},
"highlight":{
"fields":{
"*":{}
}
}
}
Which produces the result:
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.4142135,
"hits": [
{
"_index": "my_index",
"_type": "my_type",
"_id": "abc125",
"_score": 1.4142135,
"_source": {
"title": "blah golf tee",
"description": "course"
},
"highlight": {
"title": [
"<em>blah</em> <em>golf</em> tee"
]
}
},
{
"_index": "my_index",
"_type": "my_type",
"_id": "abc124",
"_score": 1.4142135,
"_source": {
"title": "blah",
"description": "golf"
},
"highlight": {
"description": [
"<em>golf</em>"
],
"title": [
"<em>blah</em>"
]
}
},
{
"_index": "my_index",
"_type": "my_type",
"_id": "abc123",
"_score": 1.4142135,
"_source": {
"title": "blah golf",
"description": "course"
},
"highlight": {
"title": [
"<em>blah</em> <em>golf</em>"
]
}
}
]
}
}
How can I modify the scoring, using function_score or other means, so that matches on the title field score higher than matches on other fields? Do I need to change the query to multi_match instead of using _all? Any suggestions would be appreciated.
Regards,
LT
Try adding a should section to your bool query; it will raise the score of documents that match any of the should clauses (and it's not mandatory for those to match for the query to return results).
For example, try this:
POST my_index/my_type/_search
{
"query": {
"bool": {
"must": [
{
"prefix": {
"_all": "gol"
}
},
{
"prefix": {
"_all": "bla"
}
}
],
"should": [
{
"prefix": {
"title": {
"value": "gol",
"boost": 3
}
}
},
{
"prefix": {
"title": {
"value": "bla",
"boost": 3
}
}
}
]
}
},
"highlight": {
"fields": {
"*": {}
}
}
}
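If you prefer, the same boost can be sketched with function_score while keeping the must clauses untouched; this is only a sketch and the weight value is an assumption:
POST my_index/my_type/_search
{
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "must": [
            { "prefix": { "_all": "gol" } },
            { "prefix": { "_all": "bla" } }
          ]
        }
      },
      "functions": [
        {
          "filter": {
            "bool": {
              "should": [
                { "prefix": { "title": "gol" } },
                { "prefix": { "title": "bla" } }
              ]
            }
          },
          "weight": 3
        }
      ],
      "boost_mode": "multiply"
    }
  },
  "highlight": {
    "fields": {
      "*": {}
    }
  }
}
As for multi_match: it can boost fields (e.g. "fields": ["title^3", "description"]), but its prefix-style types only treat the last term as a prefix, so it isn't a drop-in replacement for per-term prefix queries.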
