Update first document via Update By Query API - elasticsearch

I'm trying to get Elasticsearch to do the same thing that MongoDB does with the findOneAndUpdate method, but this doesn't seem to be possible.
The use case is that multiple servers and threads will look into the specific index for the next task to complete.
Therefore my best bet would be to update the "next" task/document with a unique ID and then retrieve the document afterwards.
This query will give me the next document to retrieve:
GET /test_index/_search
{
"query": {
"bool": {
"must_not": {
"exists": {
"field": "next_id"
}
}
}
},
"sort": {
"next_update": {"order": "asc"}
},
"size": 1
}
But I can't seem to figure out how to use the Update By Query API to update only a single row. I've been trying this query, but it updates every found document:
POST /test_index/_update_by_query
{
"query": {
"bool": {
"must_not": {
"exists": {
"field": "next_id"
}
}
}
},
"sort": {
"next_update": {"order": "asc"}
},
"script": {
"source": "ctx._source['next_id'] = params.next_id",
"params": {
"next_id": "xxxx"
}
}
}
How can I solve this?

You can use the max_docs parameter of _update_by_query and set its value to 1, so that the update is executed for only one document.
See the Update By Query API documentation for details.
POST /test_index/_update_by_query
{
"query": {
"bool": {
"must_not": {
"exists": {
"field": "next_id"
}
}
}
},
"max_docs": 1,
"sort": {
"next_update": {"order": "asc"}
},
"script": {
"source": "ctx._source['next_id'] = params.next_id",
"params": {
"next_id": "xxxx"
}
}
}

Related

Elasticsearch: How to search with all inputs only

I am looking for a solution to the problem
Problem:
I have two records, A: Trace(id, traceId, Tags) and B: Trace(id, traceId, Tags).
Both records have the same traceId and different tags.
For that I used a should clause, which returns data even if only record A has the tag in it. But what I want is that if the query has tags that are not in the records, the response should be empty.
This is the query I actually used on the Zipkin Elasticsearch data:
GET zipkin-span-2021-12-08/_search?size=10
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"term": {
"_q": "smpp.charged=false"
}
},
{
"term": {
"_q": "connection.type=WEEK"
}
},
{
"term": {
"_q": "connection.type=a"
}
}
]
}
}
]
}
},
"aggs": {
"same_treace_id": {
"terms": {
"field": "traceId",
"size": 10,
"min_doc_count": 2
}
}
},
"fields": [
"traceId"
],
"_source": true
}

Elasticsearch: Multiple fields, return true if any exists

ES version: 5.2 alpine
I have a document like this:
{
"field1": null,
"field2": "xyz",
"field3": null
}
I want to return the document if any of the fields above exists/not null.
"filter": {
"bool": {
"must": [
{
"exists": {
"field": ["field1","field2"]
}
}]
}
}
but I get following error.
[exists] unknown token [START_ARRAY] after [field]
Any idea how to do so with this ES version?
Thanks.
You cannot pass multiple fields in exists query. Use exists query for each field and wrap them in should clause as below:
{
"query": {
"bool": {
"should": [
{
"exists": {
"field": "field1"
}
},
{
"exists": {
"field": "field2"
}
},
{
"exists": {
"field": "field3"
}
}
]
}
}
}
I am not sure whether you want to return the document if a field value exists, or also return the value itself. This query returns the documents which satisfy the condition.
This can help,
You can use should query with minimum should match as 1.
"filter": {
"bool": {
"should": [
{ "exists": { "field": "field1" } },
{ "exists": { "field": "field2" } }
],
"minimum_should_match" : 1
}
}
Try it, I hope it will work in your version as well.

Need aggregation of only the query results

I need to do an aggregation, but only over the limited results I get from the query. It is not working: it returns other results outside the size limit of the query. Here is the query I am running:
{
"size": 500,
"query": {
"bool": {
"must": [
{
"term": {
"tags.keyword": "possiblePurchase"
}
},
{
"term": {
"clientName": "Ci"
}
},
{
"range": {
"firstSeenDate": {
"gte": "now-30d"
}
}
}
],
"must_not": [
{
"term": {
"tags.keyword": "skipPurchase"
}
}
]
}
},
"sort": [
{
"firstSeenDate": {
"order": "desc"
}
}
],
"aggs": {
"byClient": {
"terms": {
"field": "clientName",
"size": 25
},
"aggs": {
"byTarget": {
"terms": {
"field": "targetName",
"size": 6
},
"aggs": {
"byId": {
"terms": {
"field": "id",
"size": 5
}
}
}
}
}
}
}
}
I need the aggregations to only consider the first 500 results of the query, sorted by the field I am requesting on the query. I am completely lost. Thanks for the help
The scope of the aggregation is all the hits of your query; the size parameter is only used to specify the number of hits to fetch and display.
If you want to restrict the scope of the aggregation to the first n hits of a query, I would suggest using the sampler aggregation in combination with your query.

Terrible has_child query performance

The following query has terrible performance.
100% sure it is the has_child. Query without it runs under 300ms, with it it takes 9 seconds.
Is there some better way to use the has_child query? It seems like I could query parents, and then children by id and then join client side to do the has child check faster than the ES database engine is doing it...
{
"query": {
"filtered": {
"query": {
"bool": {
"must": [
{
"has_child": {
"type": "status",
"query": {
"term": {
"stage": "s3"
}
}
}
},
{
"has_child": {
"type": "status",
"query": {
"term": {
"stage": "es"
}
}
}
}
]
}
},
"filter": {
"bool": {
"must": [
{
"term": {
"source": "IntegrationTest-2016-03-01T23:31:15.023Z"
}
},
{
"range": {
"eventTimestamp": {
"from": "2016-03-01T20:28:15.028Z",
"to": "2016-03-01T23:33:15.028Z"
}
}
}
]
}
}
}
},
"aggs": {
"digests": {
"terms": {
"field": "digest",
"size": 0
}
}
},
"size": 0
}
Cluster info:
CPU and memory usage is low. It is an AWS ES Service cluster (v1.5.2). There are many small documents, and since the version AWS is running is old, doc values aren't on by default. Not sure if that is helping or hurting.
Since "stage" is not analyzed (based on your comment) and, therefore, you are not interested in scoring the documents that match on that field, you might realize slight performance gains by using the has_child filter instead of the has_child query. And using a term filter instead of a term query.
In the documentation for has_child, you'll notice:
The has_child filter also accepts a filter instead of a query:
The main performance benefits of using a filter come from the fact that Elasticsearch can skip the scoring phase of the query. Also, filters can be cached which should improve the performance of future searches that use the same filters. Queries, on the other hand, cannot be cached.
Try this instead:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"source": "IntegrationTest-2016-03-01T23:31:15.023Z"
}
},
{
"range": {
"eventTimestamp": {
"from": "2016-03-01T20:28:15.028Z",
"to": "2016-03-01T23:33:15.028Z"
}
}
},
{
"has_child": {
"type": "status",
"filter": {
"term": {
"stage": "s3"
}
}
}
},
{
"has_child": {
"type": "status",
"filter": {
"term": {
"stage": "es"
}
}
}
}
]
}
}
}
},
"aggs": {
"digests": {
"terms": {
"field": "digest",
"size": 0
}
}
},
"size": 0
}
I bit the bullet and just performed the parent:child join in my application. Instead of waiting 7 seconds for the has_child query, I fire off two consecutive term queries and do some post processing: 200ms.

Select distinct values of bool query elastic search

I have a query that gets me some user post data from an Elasticsearch index. I am happy with that query, though I need to make it return rows with unique usernames. Currently, it displays relevant posts by users, but it may display one user twice.
{
"query": {
"bool": {
"should": [
{ "match_phrase": { "gtitle": {"query": "voice","boost": 1}}},
{ "match_phrase": { "gdesc": {"query": "voice","boost": 1}}},
{ "match": { "city": {"query": "voice","boost": 2}}},
{ "match": { "gtags": {"query": "voice","boost": 1} }}
],"must_not": [
{ "term": { "profilepicture": ""}}
],"minimum_should_match" : 1
}
}
}
I have read about aggregations but didn't understand much (also tried to use aggs but didn't work either).... any help is appreciated
You would need to use terms aggregation to get all unique users and then use top hits aggregation to get only one result for each user. This is how it looks.
{
"query": {
"bool": {
"should": [
{
"match_phrase": {
"gtitle": {
"query": "voice",
"boost": 1
}
}
},
{
"match_phrase": {
"gdesc": {
"query": "voice",
"boost": 1
}
}
},
{
"match": {
"city": {
"query": "voice",
"boost": 2
}
}
},
{
"match": {
"gtags": {
"query": "voice",
"boost": 1
}
}
}
],
"must_not": [
{
"term": {
"profilepicture": ""
}
}
],
"minimum_should_match": 1
}
},
"aggs": {
"unique_user": {
"terms": {
"field": "userid",
"size": 100
},
"aggs": {
"only_one_post": {
"top_hits": {
"size": 1
}
}
}
}
},
"size": 0
}
Here the size inside the user aggregation is 100; you can increase that if you have more unique users (the default is 10). The outermost size is zero so that only aggregation results are returned. One important thing to remember is that your user ids have to be unique, i.e., ABC and abc will be considered different users; you might have to make your userid field not_analyzed to be sure about that.
Hope this helps!!

Resources