Elasticsearch copy_to: data needs to be copied into its own subdocument - elasticsearch

Thanks in advance for helping.
I have created ES mapping as :
{"mappings": {
"policy": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"tags": {
"properties": {
"scope": {
"type": "text",
"store": "true",
"copy_to": [
"tags.tag_scope"
]
},
"tag": {
"type": "text",
"store": "true",
"copy_to": [
"tags.tag_scope"
]
},
"tag_scope": {
"type": "text",
"store": "true"
}
}
}
}
}
}
}
When I index a policy document, all the tag and scope values from the different tags sub-documents are copied into the same tag_scope property.
For example, I added the following document to Elasticsearch:
{
"name": "policy1",
"tags": [
{
"tag": "pepsi",
"scope": "prod"
},
{
"tag": "coke",
"scope": "dev"
}
]
}
It is storing all 4 values in the tags.tag_scope field as:
"tags.tag_scope": [
"pepsi",
"prod",
"coke",
"dev"
]
My expectation was that it should be stored like:
{
"name": "policy1",
"tags": [
{
"tag": "pepsi",
"scope": "prod",
"tag_scope" : ["pepsi","prod"]
},
{
"tag": "coke",
"scope": "dev",
"tag_scope" : ["coke","dev"]
}
]
}
Could you please help me write the correct mapping for this?

What you are looking for is Nested Datatype. Change your mapping to the below:
PUT <your_index_name>
{
"mappings":{
"policy":{
"properties":{
"name":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"tags":{
"type": "nested",
"properties":{
"scope":{
"type":"text",
"store":"true",
"copy_to":[
"tags.tag_scope"
]
},
"tag":{
"type":"text",
"store":"true",
"copy_to":[
"tags.tag_scope"
]
},
"tag_scope":{
"type":"text",
"store":"true",
"fields": { <---- Added this
"keyword": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
Notice how I've made tags a nested type. This allows each object like the one below to be stored as an individual document of its own; in your case, tags holds two nested documents.
{
"tag":"coke",
"scope":"dev"
}
Now your tags.tag_scope should be what you are expecting it to be.
Now when it comes to querying for what you are looking for, the below is how a Nested Query should be.
Nested Query:
POST <your_index_name>/_search
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "tags",
"query": {
"bool": {
"must": [
{
"match": {
"tags.tag_scope": "pepsi"
}
},
{
"match": {
"tags.tag_scope": "prod"
}
}
]
}
}
}
}
]
}
}
}
To return the list of unique tags.tag_scope values, you need to add an aggregation to the query. Notice that I've set size:0, which means I only want to see the aggregation result and not the normal query hits.
Aggregation Query:
POST <your_index_name>/_search
{
"size":0,
"query":{
"bool":{
"must":[
{
"nested":{
"path":"tags",
"query":{
"bool":{
"must":[
{
"match":{
"tags.tag_scope":"pepsi"
}
},
{
"match":{
"tags.tag_scope":"prod"
}
}
]
}
}
}
}
]
}
},
"aggs":{ <----- Aggregation Query Starts Here
"myscope":{
"nested":{
"path":"tags"
},
"aggs":{
"uniqui_scope":{
"terms":{
"field":"tags.tag_scope.keyword",
"size":10
}
}
}
}
}
}
Aggregation Response:
{
"took": 53,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"myscope": {
"doc_count": 2,
"uniqui_scope": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "coke",
"doc_count": 1
},
{
"key": "dev",
"doc_count": 1
},
{
"key": "pepsi",
"doc_count": 1
},
{
"key": "prod",
"doc_count": 1
}
]
}
}
}
}
Hope this helps.

Related

Need help to correctly perform wildcard search on a field

My sData.Name mapping looks as below:
{
"abc_history": {
"mappings": {
"abc-data-type": {
"sData.Name": {
"full_name": "sData.Name",
"mapping": {
"Name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
My sData.startDate mapping looks as below
{
"abc_history": {
"mappings": {
"abc-data-type": {
"sData.startDate": {
"full_name": "sData.startDate",
"mapping": {
"startDate": {
"type": "date"
}
}
}
}
}
}
}
I am trying to perform a wildcard search on sData.Name and used following query:
{
"from": 0,
"size": 20,
"query": {
"bool": {
"must":[
{"range": {"requestDate": { "gte": "2019-10-01T08:00:00.000Z" }}},
{
"wildcard": {
"sData.Name": "*Scream*"
}
}
]
}
},
"sort": [
{ "requestDate": {"order": "desc"}}
]
}
The above query is returning empty response.
How should I modify my query so that I can perform wildcard search on sData.Name
Response from http://{serverhost}:{port}/abc_history/_search looks as below:
{
"took": 181,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "abc_history",
"_type": "abc-data-type",
"_id": "5e29cbb7965809fe6cb22a7b",
"_score": null,
"_source": {
"sData": [
{
"status": "ASSIGNED",
"Name": "CloudView abcmission Automation Support",
startDate : "2020-01-26T20:12:57.091Z"
},
{
"status": "RESOLVED",
"Name": "DSE - Tools Engineering",
startDate : "2020-01-27T20:12:57.091Z"
},
{
"status": "CLOSED",
"Name": "abcmission Orchestration",
startDate : "2020-01-29T20:12:57.091Z"
},
{
"status": "ASSIGNED",
"Name": "CloudView abcmission Automation Support",
startDate : "2020-01-29T20:19:29.687Z"
}
]
},
"sort": [
1579797431366
]
}
]
}
}
I am mainly concerned about querying sData.Name. I want to perform the search only in the last array element, so in my case I want to search only sData[3].Name. In other words, the keyword DSE should be searched only within "Name": "CloudView abcmission Automation Support".
I tried to create the index from your input. Try using:
"wildcard": {
"sData.Name.keyword": {
"wildcard": "*DSE*",
"boost": 1
}
}
The full query is:
PUT /abc_history
{
"mappings": {
"abc-data-type": {
"properties": {
"sData": {
"properties": {
"status": {
"type": "keyword"
},
"Name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
GET /abc_history/_search
{
"from": 0,
"size": 200,
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"wildcard": {
"sData.Name.keyword": {
"wildcard": "*DSE*",
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
}
It may also be combined with a sort, for example:
GET /abc_history/_search
{
"from": 0,
"size": 200,
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"wildcard": {
"sData.Name": {
"wildcard": "*ddd*",
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
},
"sort": [
{
"sData.startDate": {
"order": "asc"
}
}
]
}

ElasticSearch 2.4 - Query String OR between main type and nested

I'm using ElasticSearch 2.4
I need to create a search query that combines the main object and a nested object. If I use an AND condition it works correctly, but the problem appears when I try to use an OR condition between the main object and the nested object:
Please review the code below and tell me if there is a way to make it work using OR conditional.
Create mapping:
PUT /example_contact_purchases
{
"mappings": {
"contact": {
"dynamic": false,
"properties": {
"name": {
"type": "string"
},
"country": {
"type": "string"
},
"purchases": {
"type": "nested",
"properties": {
"uuid":{
"type":"string"
}
}
}
}
}
}
}
Mapping result:
GET example_contact_purchases/_mapping
{
"example_contact_purchases": {
"mappings": {
"contact": {
"dynamic": "false",
"properties": {
"country": {
"type": "string"
},
"name": {
"type": "string"
},
"purchases": {
"type": "nested",
"properties": {
"uuid": {
"type": "string"
}
}
}
}
}
}
}
}
Create First Item:
POST example_contact_purchases/contact
{
"name" : "Fran",
"country": "ES",
"purchases" : [
{
"uuid" : "23"
}
]
}
Create Second Item:
POST example_contact_purchases/contact
{
"name" : "Jhon",
"country": "UK",
"purchases" : [
{
"uuid" : "45"
}
]
}
Create Third Item:
POST example_contact_purchases/contact
{
"name" : "Leonardo",
"country": "IT",
"purchases" : [
{
"uuid" : "45"
}
]
}
Example Query: country == ES AND purchases.uuid == 23
GET example_contact_purchases/_search
{
"query":{
"filtered":{
"query":{
"query_string":{
"query":"country:ES"
}
}
}
},
"filter":{
"nested":{
"path":"purchases",
"filter":{
"query":{
"query_string":{
"query":"(purchases.uuid:23)"
}
}
}
}
}
}
Result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "example_contact_purchases",
"_type": "contact",
"_id": "AW_nkURti9zva2kl7ESR",
"_score": 1,
"_source": {
"name": "Fran",
"country": "ES",
"purchases": [
{
"uuid": "23"
}
]
}
}
]
}
}
Target Query: country == "ES" OR purchases.uuid == 45
You can do that with bool query. Your query will look something like this:
POST example_contact_purchases/contact/_search
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "country:ES"
}
},
{
"nested": {
"path": "purchases",
"filter": {
"query": {
"query_string": {
"query": "(purchases.uuid:45)"
}
}
}
}
}
]
}
}
}
Notice that should == OR in this case.
Also, since you are querying these fields (country and purchases.uuid) by exact value you may consider setting them as not_analyzed (or keyword in modern versions of Elasticsearch) and use exact match query like term.
Hope that helps!

Group by a part of string from a field rather than the full field in Elasticsearch

Here structure of my index:
[
{
"Id":"1",
"Path":"/Series/Current/SerieA/foo/foo",
"PlayCount":100
},
{
"Id":"2",
"Path":"/Series/Current/SerieA/bar/foo",
"PlayCount":1000
},
{
"Id":"3",
"Path":"/Series/Current/SerieA/bar/bar",
"PlayCount":50
},
{
"Id":"4",
"Path":"/Series/Current/SerieB/bla/bla",
"PlayCount":300
},
{
"Id":"5",
"Path":"/Series/Current/SerieB/goo/boo",
"PlayCount":200
},
{
"Id":"6",
"Path":"/Series/Current/SerieC/foo/zoo",
"PlayCount":100
}
]
I'd like to execute an aggregation that gives me the sum of "PlayCount" for each series, like:
[
{
"key":"serieA",
"TotalPlayCount":1150
},
{
"key":"serieB",
"TotalPlayCount":500
},
{
"key":"serieC",
"TotalPlayCount":100
}
]
This is how I try to do it but obviously query fails since this is not the proper way:
{
"size": 0,
"query":{
"filtered":{
"query":{
"regexp":{
"Path":"/Series/Current/.*"
}
}
}
},
"aggs":{
"play_count_for_current_series":{
"terms": {
"field": "Path",
"regexp": "/Series/Current/([^/]+)"
},
"aggs":{
"Total_play": { "sum": { "field": "PlayCount" } }
}
}
}
}
Is there a way to do it?
My suggestion is as follows:
DELETE test
PUT /test
{
"settings": {
"analysis": {
"filter": {
"my_special_filter": {
"type": "pattern_capture",
"preserve_original": 0,
"patterns": [
"/Series/Current/([^/]+)"
]
}
},
"analyzer": {
"my_special_analyzer": {
"tokenizer": "whitespace",
"filter": [
"my_special_filter"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"Path": {
"type": "string",
"fields": {
"for_aggregations": {
"type": "string",
"analyzer": "my_special_analyzer"
},
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Create a special analyzer that uses a pattern_capture filter to catch only those terms that you are interested in. Because I didn't want to change your current mapping for that field, I added a fields section with a sub-field that uses this special analyzer. I also added a raw field, which is not_analyzed and helps with the query itself.
POST test/test/_bulk
{"index":{}}
{"Id":"1","Path":"/Series/Current/SerieA/foo/foo","PlayCount":100}
{"index":{}}
{"Id":"2","Path":"/Series/Current/SerieA/bar/foo","PlayCount":1000}
{"index":{}}
{"Id":"3","Path":"/Series/Current/SerieA/bar/bar","PlayCount":50}
{"index":{}}
{"Id":"4","Path":"/Series/Current/SerieB/bla/bla","PlayCount":300}
{"index":{}}
{"Id":"5","Path":"/Series/Current/SerieB/goo/boo","PlayCount":200}
{"index":{}}
{"Id":"6","Path":"/Series/Current/SerieC/foo/zoo","PlayCount":100}
{"index":{}}
{"Id":"7","Path":"/Sersdasdies/Curradent/SerieC/foo/zoo","PlayCount":100}
For the aggregation, you no longer need the regular expression, because the terms aggregation uses the sub-field, which only contains the SerieX terms you need. The regexp in the query runs against Path.raw:
GET /test/test/_search
{
"size": 0,
"query": {
"filtered": {
"query": {
"regexp": {
"Path.raw": "/Series/Current/.*"
}
}
}
},
"aggs": {
"play_count_for_current_series": {
"terms": {
"field": "Path.for_aggregations"
},
"aggs": {
"Total_play": {
"sum": {
"field": "PlayCount"
}
}
}
}
}
}
And the result is
"play_count_for_current_series": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SerieA",
"doc_count": 3,
"Total_play": {
"value": 1150
}
},
{
"key": "SerieB",
"doc_count": 2,
"Total_play": {
"value": 500
}
},
{
"key": "SerieC",
"doc_count": 1,
"Total_play": {
"value": 100
}
}
]
}

ElasticSearch: Is it possible to produce a "Temporary Field" during a search request?

Sample Document:
{
"text": "this is my text",
"categories": [
{"category": "sample category"},
{"category": "local news"}
]
}
The mapping currently is:
{
"topic": {
"properties": {
"categories": {
"properties": {
"category": {
"type": "string",
"store": "no",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
Search query:
{
"_source": false,
"query":{
"match":{
"categories.category":"news"
}
},
"aggs": {
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
The result I want it to be:
{
...
"buckets": [
{
"key": "local news",
"doc_count": 1
}
]
...
}
The result actually is (it aggregates all matching documents' categories.category):
{
...
"buckets": [
{
"key": "local news",
"doc_count": 1
},{
"key": "sample category", //THIS PART IS NOT NEEDED
"doc_count": 1
}
]
...
}
Is it possible to add a temporary field during a search? In this case let's say name all the matching categories.category as categories.match_category, and aggregates by this temporary field categories.match_category? If true how can I do it and if not what should I do then?
You have multiple sub-documents within your document and you need to match against only some of them, so you should probably change the mapping to use nested documents, as follows:
mapping
{
"topic": {
"properties": {
"categories": {
"type":"nested",
"properties": {
"category": {
"type": "string",
"store": "no",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
Then you can perform your query as follows
{
"_source": false,
"query":{
"filtered":{
"query":{
"match":{
"categories.category":
{
"query" : "news",
"cutoff_frequency" : 0.001
}
}
}
}
},
"aggs": {
"categ": {
"nested" : {
"path" : "categories"
},
"aggs":{
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
}
}
Try it
Another approach, with logic more specific to your needs, is the following:
mapping
{
"topic": {
"properties": {
"categories": {
"type":"nested",
"properties": {
"category": {
"type": "string",
"store": "no",
"analyzer": "simple",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
data
{
"text": "this is my text",
"categories": [
{"category": "sample category"},
{"category": "local news"}
]
}
query
{
"query":{
"nested":{
"path":"categories",
"query":{
"filtered":{
"query":{
"match":{
"categories.category":"news"
}
}
}
}
}
},
"aggs": {
"nest":{
"nested":{
"path":"categories"
},
"aggs":{
"filt":{
"filter" : {
"script": {
"script" : "doc['categories.category'].values.contains('news')"
}
},
"aggs":{
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
}
}
}
}
produced result
{
"_shards": {
"failed": 0,
"successful": 5,
"total": 5
},
"aggregations": {
"nest": {
"doc_count": 2,
"filt": {
"doc_count": 1,
"match_count": {
"buckets": [
{
"doc_count": 1,
"key": "local news"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
}
},
"hits": {
"hits": [],
"max_score": 0.0,
"total": 1
},
"timed_out": false,
"took": 3
}
The catch here is that you have to write your own script filter in the aggregation, according to your needs. The above example worked for me with a simple analyzer in the "category" mapping.

Elasticsearch: generating terms from array using script

Would love an explanation of why this happens and how to correct it.
Here's a snippet of the source document:
{
"created_time":1412988495000,
"tags":{
"items":[
{
"tag_type":"Placement",
"tag_id":"id1"
},
{
"tag_type":"Product",
"tag_id":"id2"
}
]
}
}
The following terms aggregation:
"aggs":{
"tags":{
"terms":{
"script":"doc['tags'].value != null ? doc['tags.items.tag_type'].value + ':' + doc['tags.items.tag_id'].value : ''",
"size":2000,
"exclude":{
"pattern":"null:null"
}
}
}
}
returns:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Placement:id2",
"doc_count":1
}
]
...when you would expect:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Product:id2",
"doc_count":1
}
]
I would probably go with a nested type. I don't know all the details of your setup, but here is a proof of concept, at least. I took out the "items" property because I didn't need that many layers, and just used "tags" as the nested type. It could be added back in if needed, I think.
So I set up an index with a "nested" property:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"created_time": {
"type": "date"
},
"tags": {
"type": "nested",
"properties": {
"tag_type": {
"type": "string",
"index": "not_analyzed"
},
"tag_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then added a couple of docs (notice that the structure differs slightly from yours):
PUT /test_index/doc/1
{
"created_time": 1412988495000,
"tags": [
{
"tag_type": "Placement",
"tag_id": "id1"
},
{
"tag_type": "Product",
"tag_id": "id2"
}
]
}
PUT /test_index/doc/2
{
"created_time": 1412988475000,
"tags": [
{
"tag_type": "Type3",
"tag_id": "id3"
},
{
"tag_type": "Type4",
"tag_id": "id3"
}
]
}
Now a scripted terms aggregation inside a nested aggregation seems to do the trick:
POST /test_index/_search?search_type=count
{
"query": {
"match_all": {}
},
"aggs": {
"tags": {
"nested": { "path": "tags" },
"aggs":{
"tag_vals": {
"terms": {
"script": "doc['tag_type'].value+':'+doc['tag_id'].value"
}
}
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"tags": {
"doc_count": 4,
"tag_vals": {
"buckets": [
{
"key": "Placement:id1",
"doc_count": 1
},
{
"key": "Product:id2",
"doc_count": 1
},
{
"key": "Type3:id3",
"doc_count": 1
},
{
"key": "Type4:id3",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/4ceaf8693f85ff257c2fd0639ba62295f2e5e8c5

Resources