Querying a string consisting exactly a part of a query - elasticsearch

I have a field named "lang" which consists values "en_US","en_GB","ru_RU", e.t.c. with this mapping
"lang": {
"type": "string",
"index": "not_analyzed",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
How to filter for documents, e.g. from "US"?

One way you can do it is change "index": "not_analyzed" on the upper-level field, and set up a pattern analyzer for that field. Since you already have the "lang.raw" field set up, you'll still be able to get the untouched version for faceting or whatever.
So, to test it I set up an index like this:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"whitespace_underscore": {
"type": "pattern",
"pattern": "[\\s_]+",
"lowercase": false
}
}
}
},
"mappings": {
"doc": {
"properties": {
"name": {
"type": "string"
},
"lang": {
"type": "string",
"index_analyzer": "whitespace_underscore",
"search_analyzer": "standard",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
}
}
}
}
}
And added a few docs:
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"doc1","lang":"en_US"}
{"index":{"_id":2}}
{"name":"doc2","lang":"en_GB"}
{"index":{"_id":3}}
{"name":"doc3","lang":"ru_RU"}
Now I can filter by "US" like this:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"term": {
"lang": "US"
}
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"name": "doc1",
"lang": "en_US"
}
}
]
}
}
And I can still get a list of values with a terms aggregation on "lang.raw":
POST /test_index/_search?search_type=count
{
"aggs": {
"lang_terms": {
"terms": {
"field": "lang.raw"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"lang_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en_GB",
"doc_count": 1
},
{
"key": "en_US",
"doc_count": 1
},
{
"key": "ru_RU",
"doc_count": 1
}
]
}
}
}
Here is the code I used to test it:
http://sense.qbox.io/gist/ac3f3fd66ea649c0c3a8010241d1f6981a7e012c

Related

Elasticsearch terms aggregation returns no buckets

New elasticsearch user here and having an issue with a terms aggregation.
I have indexed 187 documents with fields like "name","host","risk" etc.
The field risk has 4 unique values ("Critical","High","Medium","Low","Informational")
I am running a terms aggregations like this:
POST http://localhost:9200/{index_name}/_search?size=0
{
"aggs":{
"riskCount":{
"terms":{
"field":"risk.keyword"
}
}
}
}
I was expecting a result stating that i have x of Critical, x of High etc.
Thing is, i get no buckets returned.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 187,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"riskCount": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
My Elasticsearch version is 7.12.0 Any ideas
Edit:
So, here's the mapping:
"findings": {
"mappings": {
"properties": {
"date_uploaded": {
"type": "date"
},
"host": {
"type": "text"
},
"name": {
"type": "text"
},
"risk": {
"type": "text"
}
}
}
}
And here's the document:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 187,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "findings",
"_type": "_doc",
"_id": "f86b6b5b-f09e-4350-9a66-d88a3a78f640",
"_score": 1.0,
"_source": {
"risk": "Informational",
"name": "HTTP Server Type and Version",
"host": "10.10.9.10",
"date_uploaded": "2021-05-07T19:39:10.810663+00:00"
}
}
]
}
}
Since the risk field is of text type, you need to update your index mapping as
PUT /_mapping
{
"properties": {
"risk": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
Then run the update_by_query API to reindex the data
You don't have any risk.keyword field in your mapping. You need to change your mapping as follows. Just run the following command to update your mapping and create the risk.keyword sub-field:
PUT index-name/_mapping
{
"properties": {
"date_uploaded": {
"type": "date"
},
"host": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"risk": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
Then reindex your data using this command:
POST index-name/_update_by_query
And then your query can be run like this:
{
"aggs":{
"riskCount":{
"terms":{
"field":"risk.keyword"
}
}
}
}

Is there anyway to sort an index before the aggregation

I have the following index template
{
"index_patterns": "notificationtiles*",
"order": 1,
"version": 1,
"aliases": {
"notificationtiles": {}
},
"settings": {
"number_of_shards": 5,
"analysis": {
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"char_filter": [],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"dynamic": "false",
"properties": {
"id": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"influencerId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"friendId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"message": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"type": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"sponsorshipCharityId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"createdTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"updatedTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"createdDate": {
"type": "date"
},
"updatedDate": {
"type": "date"
}
}
}
}
with the following query
{
"query": {
"bool": {
"must": [
{
"match": {
"influencerId": "52407710-f7be-49c1-bc15-6d52363144a6"
}
},
{
"match": {
"type": "friend_completed_sponsorship"
}
}
]
}
},
"size": 0,
"aggs": {
"friendId": {
"terms": {
"field": "friendId",
"size": 2
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"createdDate": {
"order": "desc"
}
}
],
"_source": {
"includes": [
"sponsorshipCharityId",
"message",
"createdDate"
]
},
"size": 1
}
}
}
}
}
}
which returns
{
"took": 72,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 12,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"friendId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 7,
"buckets": [
{
"key": "cf750fd8-998f-4dcd-9c88-56b2b6d6fce9",
"doc_count": 3,
"latest": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "416a8e07-fd72-46d4-ade1-b9442ef46978",
"_score": null,
"_source": {
"createdDate": "2020-06-24T17:35:17.816842Z",
"sponsorshipCharityId": "336de13c-f522-4796-9218-f373ff0b4373",
"message": "Contact Test 788826 Completed Sponsorship!"
},
"sort": [
1593020117816
]
}
]
}
}
},
{
"key": "93ab55c5-795f-44b0-900c-912e3e186da0",
"doc_count": 2,
"latest": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "66913b8f-94fe-49fd-9483-f332329b80dd",
"_score": null,
"_source": {
"createdDate": "2020-06-24T17:57:17.816842Z",
"sponsorshipCharityId": "dbad136c-5002-4470-b85d-e5ba1eff515b",
"message": "Contact Test 788826 Completed Sponsorship!"
},
"sort": [
1593021437816
]
}
]
}
}
}
]
}
}
}
However, I'd like the results to include the latest documents (ordered by createdDate desc), for example the following document
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "68a2a0a8-27aa-4347-8751-d7afccfa797d",
"_score": 1.0,
"_source": {
"id": "68a2a0a8-27aa-4347-8751-d7afccfa797d",
"influencerId": "52407710-f7be-49c1-bc15-6d52363144a6",
"friendId": "af342805-1990-4794-9d67-3bb2dd1e36dc",
"message": "Contact Test 788826 Completed Sponsorship!",
"type": "friend_completed_sponsorship",
"sponsorshipCharityId": "b2db72e6-a70e-414a-bf8b-558e6314e7ec",
"createdDate": "2020-06-25T17:35:17.816842Z",
"updatedDate": "2020-06-25T17:35:17.816876Z",
"createdTimestampEpochInMilliseconds": 1593021437817,
"updatedTimestampEpochInMilliseconds": 1593021437817
}
}
I need to get the 2 latests documents grouped by friendId with the latest document per friendId. The part of grouping by friendId with the latest document per friendId, works fine. However, I'm unable to sort the index by createdDate desc before the aggregation happens.
essentially, i'd like to sort the index by createdDate desc, before the aggregation takes place. I don't want to have a parent aggregate by createdDate since that wouldn't result in unique friendId. How can that be achieved?
It looks like you need to set the order property of your terms aggregation. By default they are ordered by hit count. You want them to be ordered by the max createdDate. So you should add a sub aggregation to calculate the max createdDate, and then you can use the ID of that aggregation to order the parent terms aggregation.

how do I get elasticsearch to always return the smallest field value ("url length")?

How do I always return the documents with the lowest value in the "url_length" field regardless of (from) that I sent to search?
in the query below, I request the results that have the word (netflix) and that the field (pgrk) is between 9 and 10 and that the field (url_length) is less than 4, but when I put it ("from": 1, "size ": 1) does not return the doc of (_id 15) that has the word (netflix) the field pgrk = 10 and the field (url_length) = 2. Returns the doc of (_id 14) that has the word (netflix) the field pgrk = 10 and the field (url_length) = 3
just return the doc of (_id 15) that has the field (url_length) = 2 if I put it in the query from ZERO ("from": 0, "size": 1)
because I had it searched ("from": 1, "size": 1,) and didn't bring the record of (_id 15) that has the ("url_length" = 2) brought the record of (_id 14) that has the ("url_length" = 3)
{
"from": 1,
"size": 1,
"sort": [
{
"pgrk": {
"order": "desc"
}
},
{
"url_length": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": {
"multi_match": {
"query": "netflix",
"type": "cross_fields",
"fields": [
"tittle",
"description",
"url"
],
"operator": "and"
}
},
"filter": [
{
"range": {
"pgrk": {
"gte": 9,
"lte" : 10
}
}
},
{
"range": {
"url_length": {
"lt" : 4
}
}
}
]
}
}
}
if I put ("from": 1, "size": 1,) it does not return the record (_id 15) that has "url_length = 2" returns the doc of _id 14 that has "url_length = 3" as shown below:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "teste",
"_type": "_doc",
"_id": "14",
"_score": null,
"_source": {
"url": "www.333.com",
"title": "netflix netflix netflix netflix netflix netflix netflix netflix netflix netflix",
"description": "tudo sobre netflix netflix netflix netflix netflix netflix",
"pgrk": "10",
"url_length": "3"
},
"sort": [
10,
3
]
}
]
}
}
if I put ("from": 0, "size": 1,) then it returns the record (_id 15) that has "url_length = 2"
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "teste",
"_type": "_doc",
"_id": "15",
"_score": null,
"_source": {
"url": "www.netflix.yahoo.com",
"title": "melhor filme",
"description": "tudo sobre series",
"pgrk": "10",
"url_length": "2"
},
"sort": [
10,
2
]
}
]
}
}
how do I always return the documents with the lowest value in the "url_length" field regardless of (from) that I sent to search?
follows my mapping:
{
"settings": {
"index": {
"number_of_shards": "5",
"number_of_replicas": "0",
"analysis": {
"filter": {
"stemmer_plural_portugues": {
"name": "minimal_portuguese",
"stopwords" : ["http", "https", "ftp", "www"],
"type": "stemmer"
}
},
"analyzer": {
"analyzer_customizado": {
"filter": [
"lowercase",
"stemmer_plural_portugues",
"asciifolding"
],
"tokenizer": "lowercase"
}
}
}
}
},
"mappings": {
"properties": {
"q": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"id": {
"type": "long"
},
"#timestamp": {
"type": "date"
},
"data": {
"type": "date"
},
"#version": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"quebrado": {
"type": "byte"
},
"pgrk": {
"type": "integer"
},
"url_length": {
"type": "integer"
},
"term": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"titulo": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"descricao": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"url": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
}
}
}
}

Why my ElasticSearch query does not fetch any records?

I'm running the following query :
{
"size": 50,
"_source" : ["servername", "silo", "packages.displayname", "packages.displayversion","environment"],
"query": {
"bool": {
"must": {
"match": {
"packages.displayname": "Google Chrome"
}
}
,
"must": {
"type": {
"value": "server"
}
}
}
}
}
But it doesn't fetch any records
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
However, the concerned index\type has some records where "packages.displayname" = "Google Chrome", below is a sample of the index\type
{
"took": 78,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 994,
"max_score": 1,
"hits": [
{
"_index": "package_conformity-13.02.2019",
"_type": "server",
"_id": "AWjklhaPsoJF1yu58sfg",
"_score": 1,
"_source": {
"environment": "PRD",
"servername": "Zephyr",
"packages": [
{
"displayname": "Google Chrome",
"displayversion": "71.0.3578.80"
},
here is the index mapping :
{
"package_conformity-13.02.2019": {
"mappings": {
"server": {
"properties": {
"environment": {
"type": "keyword"
},
"farm": {
"type": "keyword"
},
"packages": {
"type": "nested",
"properties": {
"InstallDate": {
"type": "date",
"index": false
},
"InstallLocation": {
"type": "text",
"index": false
},
"comments": {
"type": "text",
"index": false
},
"displayname": {
"type": "keyword"
},
"displayversion": {
"type": "keyword",
"index": false
},
"publisher": {
"type": "text",
"index": false
},
"regkey": {
"type": "keyword",
"index": false
}
}
},
"servername": {
"type": "keyword"
},
"silo": {
"type": "keyword"
},
"timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
}
}
}
}
}
}
Is there something wrong in the way of querying or in the index structure or content ? Please help me by pointing me to the right way..
Thanks
If you want multiple constraints inside your must clause, you need to have an array (and not repeat the must keyword multiple times). Also, the constraint on _type should be made differently, using a term query. Try this query instead:
{
"size": 50,
"_source": [
"servername",
"silo",
"packages.displayname",
"packages.displayversion",
"environment"
],
"query": {
"bool": {
"must": [
{
"nested": {
"path": "packages",
"query": {
"match": {
"packages.displayname": "Google Chrome"
}
}
}
},
{
"term": {
"_type": "server"
}
}
]
}
}
}

Elastic Search : Restricting the search result in array

My index metadata :
{
"never": {
"aliases": {},
"mappings": {
"userDetails": {
"properties": {
"Residence_address": {
"type": "nested",
"include_in_parent": true,
"properties": {
"Address_type": {
"type": "string",
"analyzer": "standard"
},
"Pincode": {
"type": "string",
"analyzer": "standard"
},
"address": {
"type": "string",
"analyzer": "standard"
}
}
}
}
}
},
"settings": {
"index": {
"creation_date": "1468850158519",
"number_of_shards": "5",
"number_of_replicas": "1",
"version": {
"created": "1060099"
},
"uuid": "v2njuC2-QwSau4DiwzfQ-g"
}
},
"warmers": {}
}
}
My setting :
POST never
{
"settings": {
"number_of_shards" : 5,
"analysis": {
"analyzer": {
"standard": {
"tokenizer": "keyword",
"filter" : ["lowercase","reverse"]
}
}
}
}
}
My data :
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
My query :
POST /never/_search?pretty
{
"query": {
"match": {
"Residence_address.address": "Omega"
}
}
}
My Result :
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
Is there any way to restrict my result to only object containing address = Omega Residency and NOT the other object having address = Collage of Engineering?
You can only do it with nested query and inner_hits. I see that you have include_in_parent: true and not using nested queries though. If you only want to get the matched nested objects you'd need to use inner_hits from nested queries:
GET /never/_search?pretty
{
"_source": false,
"query": {
"nested": {
"path": "Residence_address",
"query": {
"match": {
"Residence_address.address": "Omega Residency"
}
},
"inner_hits" : {}
}
}
}

Resources