Elasticsearch Multi-Term Auto Completion - elasticsearch

I'm trying to implement the Multi-Term Auto Completion that's presented here.
Filtering down to the correct documents works, but when aggregating the completion_terms they are not filtered to those that match the current partial query, but instead include all completion_terms from any matched documents.
Here are the mappings:
{
"mappings": {
"dynamic" : "false",
"properties" : {
"completion_ngrams" : {
"type" : "text",
"analyzer" : "completion_ngram_analyzer",
"search_analyzer" : "completion_ngram_search_analyzer"
},
"completion_terms" : {
"type" : "keyword",
"normalizer" : "completion_normalizer"
}
}
}
}
Here are the settings:
{
"settings" : {
"index" : {
"analysis" : {
"filter" : {
"edge_ngram" : {
"type" : "edge_ngram",
"min_gram" : "1",
"max_gram" : "10"
}
},
"normalizer" : {
"completion_normalizer" : {
"filter" : [
"lowercase",
"german_normalization"
],
"type" : "custom"
}
},
"analyzer" : {
"completion_ngram_search_analyzer" : {
"filter" : [
"lowercase"
],
"tokenizer" : "whitespace"
},
"completion_ngram_analyzer" : {
"filter" : [
"lowercase",
"edge_ngram"
],
"tokenizer" : "whitespace"
}
}
}
}
}
}
}
I'm then indexing data like this:
{
"completion_terms" : ["Hammer", "Fortis", "Tool", "2000"],
"completion_ngrams": "Hammer Fortis Tool 2000"
}
Finally, the autocomplete search looks like this:
{
"query": {
"bool": {
"must": [
{
"term": {
"completion_terms": "fortis"
}
},
{
"term": {
"completion_terms": "hammer"
}
},
{
"match": {
"completion_ngrams": "too"
}
}
]
}
},
"aggs": {
"autocomplete": {
"terms": {
"field": "completion_terms",
"size": 100
}
}
}
}
This correctly returns documents matching the search string "fortis hammer too", but the aggregations include ALL completion terms that are included in any of the matched documents, e.g. for the query above:
"buckets": [
{ "key": "fortis" },
{ "key": "hammer" },
{ "key": "tool" },
{ "key": "2000" },
]
Ideally, I'd expect
"buckets": [
{ "key": "tool" }
]
I could filter out the terms that are already covered by the search query ("fortis" and "hammer" in this case) in the app, but the "2000" doesn't make any sense from a user's perspective, because it doesn't partially match any of the provided search terms.
I understand why this is happening, but I can't think of a solution. Can anyone help?

Try a filters aggregation, please:
{
"query": {
"bool": {
"must": [
{
"term": {
"completion_terms": "fortis"
}
},
{
"term": {
"completion_terms": "hammer"
}
},
{
"match": {
"completion_ngrams": "too"
}
}
]
}
},
"aggs": {
"findOuthammerAndfortis": {
"filters": {
"filters": {
"fortis": {
"term": {
"completion_terms": "fortis"
}
},
"hammer": {
"term": {
"completion_terms": "hammer"
}
}
}
}
}
}
}

Related

ElasticSearch Aggregation Filter (not nested) Array

I have mapping like that:
PUT myindex1/_mapping
{
"properties": {
"program":{
"properties":{
"rounds" : {
"properties" : {
"id" : {
"type" : "keyword"
},
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
And my example docs:
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000000", "name":"Test1"},
{"id":"00000000-0000-0000-0000-000000000001", "name":"Fact2"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000002", "name":"Test3"},
{"id":"00000000-0000-0000-0000-000000000003", "name":"Fact4"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000004", "name":"Test5"},
{"id":"00000000-0000-0000-0000-000000000005", "name":"Fact6"}
]
}
}
Purpose: get only the names of the rounds that match a wildcard filter supplied by the user.
Aggregation query:
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must":[
{
"wildcard": {
"program.rounds.name": "*test*"
}
}
]
}
}
}
},
"size": 0
}
This aggregation returns all 6 names, but I need only Test1, Test3 and Test5. I also tried the "include": "/tes.*/i" regex pattern for the terms aggregation, but the ignore-case flag does not work.
Note: I'm not sure about the nested type, because I'm not interested in the association between Id and Name (at least for now).
ElasticSearch version: 7.7.0
If you want to only aggregate specific rounds based on a condition on the name field, then you need to make rounds nested, otherwise all name values end up in the same field.
Your mapping needs to be changed to this:
PUT myindex1/
{
"mappings": {
"properties": {
"program": {
"properties": {
"rounds": {
"type": "nested", <--- add this
"properties": {
"id": {
"type": "keyword"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
And then your query needs to change to this:
GET myindex1/_search
{
"size": 0,
"query": {
"nested": {
"path": "program.rounds",
"query": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"aggs": {
"rounds": {
"nested": {
"path": "program.rounds"
},
"aggs": {
"name_filter": {
"filter": {
"wildcard": {
"program.rounds.name": "*Test*"
}
},
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
}
}
}
}
}
}
And the result will be:
"aggregations" : {
"rounds" : {
"doc_count" : 6,
"name_filter" : {
"doc_count" : 3,
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Test1",
"doc_count" : 1
},
{
"key" : "Test3",
"doc_count" : 1
},
{
"key" : "Test5",
"doc_count" : 1
}
]
}
}
}
}
UPDATE:
Actually, you can achieve what you want without introducing nested types by using the following query. You were close, but the include pattern was wrong:
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"include": "[Tt]est.*",
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"size": 0
}

How to Query elasticsearch index with nested and non nested fields

I have an elastic search index with the following mapping:
PUT /student_detail
{
"mappings" : {
"properties" : {
"id" : { "type" : "long" },
"name" : { "type" : "text" },
"email" : { "type" : "text" },
"age" : { "type" : "text" },
"status" : { "type" : "text" },
"tests":{ "type" : "nested" }
}
}
}
Data stored is in form below:
{
"id": 123,
"name": "Schwarb",
"email": "abc#gmail.com",
"status": "current",
"age": 14,
"tests": [
{
"test_id": 587,
"test_score": 10
},
{
"test_id": 588,
"test_score": 6
}
]
}
I want to be able to query the students where name like '%warb%' AND email like '%gmail.com%' AND the test with id 587 has a score > 5, etc. At a high level, what is needed looks something like the query below. I don't know what the actual query would be, and I apologize for this messy pseudo-query:
GET developer_search/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "abc"
}
},
{
"nested": {
"path": "tests",
"query": {
"bool": {
"must": [
{
"term": {
"tests.test_id": IN [587]
}
},
{
"term": {
"tests.test_score": >= some value
}
}
]
}
}
}
}
]
}
}
}
The query must be flexible so that we can supply dynamic test ids and their respective score filters, along with filters on the non-nested fields such as age, name and status.
Something like this?
GET student_detail/_search
{
"query": {
"bool": {
"must": [
{
"wildcard": {
"name": {
"value": "*warb*"
}
}
},
{
"wildcard": {
"email": {
"value": "*gmail.com*"
}
}
},
{
"nested": {
"path": "tests",
"query": {
"bool": {
"must": [
{
"term": {
"tests.test_id": 587
}
},
{
"range": {
"tests.test_score": {
"gte": 5
}
}
}
]
}
},
"inner_hits": {}
}
}
]
}
}
}
Inner hits is what you are looking for.
You should make use of the Ngram Tokenizer instead, because wildcard queries perform poorly and I wouldn't recommend using them.
Change your mapping to the below where you can create your own Analyzer which I've done in the below mapping.
The way Elasticsearch (or rather Lucene) indexes a statement is as follows: first it breaks the statement or paragraph into words or tokens, then it indexes these tokens in the inverted index for that particular field. This process is called Analysis, and it applies only to the text datatype.
So now you only get the documents if these tokens are available in inverted index.
By default, standard analyzer would be applied. What I've done is I've created my own analyzer and used Ngram Tokenizer which would be creating many more tokens than just simply words.
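With the default (standard) analyzer, the tokens for Life is beautiful would be life, is, beautiful.
However, using the Ngram tokenizer below (min_gram 3, max_gram 4), the tokens for Life would be Lif, Life and ife. Case is preserved because my_analyzer has no lowercase filter.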
Mapping:
PUT student_detail
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 4,
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings" : {
"properties" : {
"id" : {
"type" : "long"
},
"name" : {
"type" : "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"email" : {
"type" : "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"age" : {
"type" : "text" <--- I am not sure why this is text. Change it to long or int. Would leave this to you
},
"status" : {
"type" : "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"tests":{
"type" : "nested"
}
}
}
}
Note that in the above mapping I've created a sibling field in the form of keyword for name, email and status as below:
"name":{
"type":"text",
"analyzer":"my_analyzer",
"fields":{
"keyword":{
"type":"keyword"
}
}
}
Now your query could be as simple as below.
Query:
POST student_detail/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "war" <---- Note this. This would even return documents having "Schwarb"
}
},
{
"match": {
"email": "gmail" <---- Note this
}
},
{
"nested": {
"path": "tests",
"query": {
"bool": {
"must": [
{
"term": {
"tests.test_id": 587
}
},
{
"range": {
"tests.test_score": {
"gte": 5
}
}
}
]
}
}
}
}
]
}
}
}
Note that for exact matches I would make use of Term Queries on keyword fields while for normal searches or LIKE in SQL I would make use of simple Match Queries on text Fields provided they make use of Ngram Tokenizer.
Also note that for >= and <= you would need to make use of Range Query.
Response:
{
"took" : 233,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 3.7260926,
"hits" : [
{
"_index" : "student_detail",
"_type" : "_doc",
"_id" : "1",
"_score" : 3.7260926,
"_source" : {
"id" : 123,
"name" : "Schwarb",
"email" : "abc#gmail.com",
"status" : "current",
"age" : 14,
"tests" : [
{
"test_id" : 587,
"test_score" : 10
},
{
"test_id" : 588,
"test_score" : 6
}
]
}
}
]
}
}
Note that the document you mentioned in your question appears in the response when I run the query.
Please do read the links I've shared. It is vital that you understand the concepts. Hope this helps!

Elasticsearch aggregation by arrays of String

I have an ElasticSearch index, where I store telephony transactions (SMS, MMS, Calls, etc ) with their associated costs.
The key of these documents is the MSISDN (MSISDN = phone number). In my app, I know that there are groups of users. Each user can have one or more MSISDNs.
Here is the mapping of this kind of documents :
"mappings" : {
"cdr" : {
"properties" : {
"callDatetime" : {
"type" : "long"
},
"callSource" : {
"type" : "string"
},
"callType" : {
"type" : "string"
},
"callZone" : {
"type" : "string"
},
"calledNumber" : {
"type" : "string"
},
"companyKey" : {
"type" : "string"
},
"consumption" : {
"properties" : {
"data" : {
"type" : "long"
},
"voice" : {
"type" : "long"
}
}
},
"cost" : {
"type" : "double"
},
"country" : {
"type" : "string"
},
"included" : {
"type" : "boolean"
},
"msisdn" : {
"type" : "string"
},
"network" : {
"type" : "string"
}
}
}
}
My goal and issue :
My goal is to make a query that retrieves the cost by callType by group. But groups are not represented in ElasticSearch, only in my PostgreSQL database.
So I will make a method that retrieves all the MSISDN for every existing group, and get something like a List of String arrays, containing every MSISDN within each group.
Let's say I have something like :
"msisdn_by_group" : [
{
"group1" : ["01111111111", "02222222222", "033333333333", "044444444444"]
},
{
"group2" : ["05555555555","06666666666"]
}
]
Now, I will use this to generate an Elasticsearch query. I want to use an aggregation to compute the sum of the cost for all those MSISDNs, in a separate bucket per group, and then split each bucket again by callType (to make a stacked bar chart).
I've tried several things, but didn't manage to make it work (histogram, buckets, terms and sum were the main keywords I was playing with).
If somebody here can help me with the order, and the keywords I can use to achieve this, it would be great :) Thanks
EDIT :
Here is my last try :
QUERY:
{
"aggs" : {
"cost_histogram": {
"terms": {
"field": "callType"
},
"aggs": {
"cost_histogram_sum" : {
"sum": {
"field": "cost"
}
}
}
}
}
}
I got the expected result, but it is missing the "group" split, as I don't know how to pass the MSISDN arrays as a criterion:
RESULT :
"aggregations": {
"cost_histogram": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "data",
"doc_count": 5925,
"cost_histogram_sum": {
"value": 0
}
},
{
"key": "sms_mms",
"doc_count": 5804,
"cost_histogram_sum": {
"value": 91.76999999999995
}
},
{
"key": "voice",
"doc_count": 5299,
"cost_histogram_sum": {
"value": 194.1196
}
},
{
"key": "sms_mms_plus",
"doc_count": 35,
"cost_histogram_sum": {
"value": 7.2976
}
}
]
}
}
OK, I found out how to do this with one query. It's a very long query because it repeats for every group, but I have no choice. I'm using the "filter" aggregation.
Here is a working example based on the array I wrote in my question above :
POST localhost:9200/cdr/_search?size=0
{
"query": {
"term" : {
"companyKey" : 1
}
},
"aggs" : {
"group_1_split_cost": {
"filter": {
"bool": {
"should": [{
"bool": {
"must": {
"match": {
"msisdn": "01111111111"
}
}
}
},{
"bool": {
"must": {
"match": {
"msisdn": "02222222222"
}
}
}
},{
"bool": {
"must": {
"match": {
"msisdn": "03333333333"
}
}
}
},{
"bool": {
"must": {
"match": {
"msisdn": "04444444444"
}
}
}
}]
}
},
"aggs": {
"cost_histogram": {
"terms": {
"field": "callType"
},
"aggs": {
"cost_histogram_sum" : {
"sum": {
"field": "cost"
}
}
}
}
}
},
"group_2_split_cost": {
"filter": {
"bool": {
"should": [{
"bool": {
"must": {
"match": {
"msisdn": "05555555555"
}
}
}
},{
"bool": {
"must": {
"match": {
"msisdn": "06666666666"
}
}
}
}]
}
},
"aggs": {
"cost_histogram": {
"terms": {
"field": "callType"
},
"aggs": {
"cost_histogram_sum" : {
"sum": {
"field": "cost"
}
}
}
}
}
}
}
}
Thanks to the newer versions of Elasticsearch we can now nest aggregations very deeply, but it's still a pity that we can't pass arrays of values to an "OR" operator or something like that. It could reduce the size of those queries, I guess, even if they are a bit special and used in niche cases like mine.

Elastic Search 1.7.3 Nested filter: matching terms in an array of objects

I am trying to query for the following document in my elasticsearch:
"amenity": [
"Free Wifi",
"Free Breakfast",
"Veg Only",
"Swimming Pool",
"Newspaper",
"Bar",
"Credit Card",
"Pickup & Drop",
"Gym",
"Elevator",
"Valet Parking"
],
"dodont": [
{
"do_or_dont": "Do",
"what": "Vegetarians"
},
{
"do_or_dont": "Do",
"what": "Family"
},
{
"do_or_dont": "Dont",
"what": "Loud Music"
},
{
"do_or_dont": "Dont",
"what": "Booze"
}
]
and here is the query I have written:
"filter": {
"and": {
"filters": [
{
"nested" : {
"path" : "dodont",
"filter" : {
"bool" : {
"must": [{"and" : [
{
"term" : {"dodont.do_or_dont" : "Do"}
},
{
"term" : {"dodont.what" : "Vegetarians"}
}
]},
{"and" : [
{
"term" : {"dodont.do_or_dont" : "Do"}
},
{
"term" : {"dodont.what" : "Family"}
}
]}]
}
}
}
}
]
}
}
This query returns an empty result, but when I change the "must" to "should" in the bool in the code above, it returns the document shown above (it is the only document matching this filter). Ideally, the "must" condition should return that document: I want to pass multiple objects for do's and don'ts and get only the results that match all of them, but I am not able to do so. How should I go about it?
You need to split out the two conditions on your nested document, since each element of the dodont nested array is conceptually a separate document:
{
"filter": {
"and": {
"filters": [
{
"nested": {
"path": "dodont",
"filter": {
"and": [
{
"term": {
"dodont.do_or_dont": "Do"
}
},
{
"term": {
"dodont.what": "Vegetarians"
}
}
]
}
}
},
{
"nested": {
"path": "dodont",
"filter": {
"and": [
{
"term": {
"dodont.do_or_dont": "Do"
}
},
{
"term": {
"dodont.what": "Family"
}
}
]
}
}
}
]
}
}
}

elasticsearch searching array field inside nested type

I am trying to filter my results using a nested filter, but I am getting incorrect results.
Here is my mapping info:
{
"stock" : {
"mappings" : {
"clip" : {
"properties" : {
"description" : {
"type" : "string"
},
"keywords" : {
"type" : "nested",
"properties" : {
"category" : {
"type" : "string"
},
"tags" : {
"type" : "string",
"index_name" : "tag"
}
}
},
"tags" : {
"type" : "string",
"index_name" : "tag"
},
"title" : {
"type" : "string"
}
}
}
}
}
}
clip document data
{
"_index" : "stock",
"_type" : "clip",
"_id" : "AUnsTOBBpafrKleQN284",
"_score" : 1.0,
"_source":{
"title": "journey to forest",
"description": "this clip contain information about the animals",
"tags": ["birls", "wild", "animals", "roar", "forest"],
"keywords": [
{
"tags": ["spring","summer","autumn"],
"category": "Weather"
},
{
"tags": ["Cloudy","Stormy"],
"category": "Season"
},
{
"tags": ["Exterior","Interior"],
"category": "Setting"
}
]
}
I am trying to filter on the tags inside the nested field 'keywords'.
Here is my query:
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "keywords",
"filter": {
"bool": {
"must": [
{
"terms": { "tags": ["autumn", "summer"] }
}
]
}
}
}
}
}
}
}
I am getting no results. Why?
What's wrong with my query or schema? Please help.
The above query is incorrect: it parses, but it references the wrong field. You need to provide the full path to tags from the root keywords field in the terms filter, i.e. keywords.tags:
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "keywords",
"filter": {
"bool": {
"must": [
{
"terms": { "keywords.tags": ["autumn", "summer"] }
}
]
}
}
}
}
}
}
}

Resources