Distinct records with geo_distance sort on aggregation ES - elasticsearch

I'm working on nearby API using elasticsearch.
I'm trying to perform 4 actions in a single ES query:
match condition (here running a script to get records within radius)
get distinct records based on company's Key (want to get one record from a company)
sort records based on geo_distance
add the field as Distance to get the distance between user and location
Here is my code:
const query = {
query: {
bool: {
must: [
customQuery,
{
term: {
"schedule.isShopOpen": true,
},
},
{
term: {
isBranchAvailable: true,
},
},
{
term: {
branchStatus: "active",
},
},
{
match:{
shopStatus: "active"
}
},
{
script: {
script: {
params: {
lat: parseFloat(req.lat),
lon: parseFloat(req.lon),
},
source:
"doc['location'].arcDistance(params.lat, params.lon) / 1000 <= doc['searchRadius'].value",
lang: "painless",
},
},
},
],
},
},
aggs: {
duplicateCount: {
terms: {
field: "companyKey",
size: 10000,
},
aggs: {
duplicateDocuments: {
top_hits: {
sort: [
{
_geo_distance: {
location: {
lat: parseFloat(req.lat),
lon: parseFloat(req.lon),
},
order: "asc",
unit: "km",
mode: "min",
distance_type: "arc",
ignore_unmapped: true,
},
},
],
script_fields: {
distance: {
script: {
params: {
lat: parseFloat(req.lat),
lon: parseFloat(req.lon),
},
inline: `doc['location'].arcDistance(params.lat, params.lon)/1000`,
},
},
},
stored_fields: ["_source"],
size: 1,
},
},
},
},
},
};
Here's the output:
data: [
{
companyKey: "1234",
companyName: "Floward",
branchKey: "3425234",
branch: "Mursilat",
distance: 1.810064121687324,
},
{
companyKey: "0978",
companyName: "Dkhoon",
branchKey: "352345",
branch: "Wahah blue branch ",
distance: 0.08931851500047634,
},
{
companyKey: "567675",
companyName: "Abdulaziz test",
branchKey: "53425",
branch: "Jj",
distance: 0.011447273197846672,
},
{
companyKey: "56756",
companyName: "Mouj",
branchKey: "345345",
branch: "King fahad",
distance: 5.822936713752124,
},
];
I have two issues:
How can I sort the records based on geo_distance?
Will the query conditions (match, script) also apply to the aggregation data?
Can you please help me solve these issues?

This would be a more appropriate query for your use case:
{
"query": {
"bool": {
"filter": [
{
"geo_distance": {
"distance": "200km",
"distance_type": "arc",
"location": {
"lat": 40,
"lon": -70
}
}
},
{
"match": {
"shopStatus": "active"
}
}
]
}
},
"collapse": {
"field": "companyKey"
},
"sort": [
{
"_geo_distance": {
"location": {
"lat": 40,
"lon": 71
},
"order": "asc",
"unit": "km",
"mode": "min",
"distance_type": "arc",
"ignore_unmapped": true
}
}
],
"_source": ["*"],
"script_fields": {
"distance_in_m": {
"script": "doc['location'].arcDistance(40, -70)" // convert to unit required
}
}
}
Filter instead of must - since you are just filtering documents, filter will be faster because, unlike must, it does not score documents.
collapse
You can use the collapse parameter to collapse search results based on field values. The collapsing is done by selecting only the top sorted document per collapse key.
Geo distance instead of script -- to find documents within the given distance
script field to get distance

Related

OpenSearch / ElasticSearch index mappings

I have a system that ingests multiple scores for events and we use opensearch (previously elastic search) for getting the averages.
For example, an input would be similar to:
// event 1
{
id: "foo1",
timestamp: "some-iso8601-timestamp",
scores: [
{ name: "arbitrary-name-1", value: 80 },
{ name: "arbitrary-name-2", value: 55 },
{ name: "arbitrary-name-3", value: 30 },
]
}
// event 2
{
id: "foo2",
timestamp: "some-iso8601-timestamp",
scores: [
{ name: "arbitrary-name-1", value: 90 },
{ name: "arbitrary-name-2", value: 65 },
{ name: "arbitrary-name-3", value: 40 },
]
}
The score name are arbitrary and subject to change from time to time.
We ultimately would like to query the data to get the average scores values:
[
{ name: "arbitrary-name-1", value: 85 },
{ name: "arbitrary-name-2", value: 60 },
{ name: "arbitrary-name-3", value: 35 },
]
However, the only way we have been able to achieve this so far has been to insert multiple documents, one for each score name/value pair in each event. This seems wasteful. The search in place currently is to group the documents by score name and timestamp intervals, then to perform a weighted average of the scores in each bucket.
Is there a way the data can be inserted to allow this query pattern to take place by only adding one document into opensearch per event/record (rather than one document per score per event/record)? How might that look?
Thanks!
Is this what you were trying to do?
I got a bit confused. ^^
DELETE /71397606
PUT /71397606
{
"mappings": {
"properties": {
"id": {
"type": "text"
},
"scores": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "long"
}
}
},
"timestamp": {
"type": "text"
}
}
}
}
POST /_bulk
{"index":{"_index":"71397606"}}
{"id":"foo1","timestamp":"some-iso8601-timestamp","scores":[{"name":"arbitrary-name-1","value":80},{"name":"arbitrary-name-2","value":55},{"name":"arbitrary-name-3","value":30}]}
{"index":{"_index":"71397606"}}
{"id":"foo2","timestamp":"some-iso8601-timestamp","scores":[{"name":"arbitrary-name-1","value":90},{"name":"arbitrary-name-2","value":65},{"name":"arbitrary-name-3","value":40}]}
{"index":{"_index":"71397606"}}
{"id":"foo2","timestamp":"some-iso8601-timestamp","scores":[{"name":"arbitrary-name-1","value":85},{"name":"arbitrary-name-x","value":65},{"name":"arbitrary-name-y","value":40}]}
GET /71397606/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"nested": {
"nested": {
"path": "scores"
},
"aggs": {
"pername": {
"terms": {
"field": "scores.name",
"size": 10
},
"aggs": {
"avg": {
"avg": {
"field": "scores.value"
}
}
}
}
}
}
}
}
ps:
If not, could you give an example?

Elastic search exclude a nested element from search results, get element by id

I have item with nested objects:
{
"name": "The Amazon rainforest",
"id": "610d33da26c25b00191ebcbe",
"tags": [
{
"name": "Brazil",
"verified": 1
},
{
"name": "new_tag",
"verified": 0,
}
],
}
in search results unverified tag should be omitted:
output of search by id: 610d33da26c25b00191ebcbe
{
"name": "The Amazon rainforest",
"id": "610d33da26c25b00191ebcbe",
"tags": [
{
"name": "Brazil",
"verified": 1
}
],
}
Node.js version of answer:
const { body } = await elasticWrapper.client.search({
index: ElasticIndexs.Products,
filter_path:
'hits.hits._source*, hits.hits.inner_hits.tags.hits.hits._source*',
body: {
_source: {
excludes: ['tags'],
},
query: {
bool: {
must: [
{
match: {
id: req.params.id,
},
},
],
should: [
{
nested: {
path: 'tags',
query: {
term: {
'tags.verified': 1,
},
},
inner_hits: {},
},
},
],
},
},
},
});

elasticsearch nested query returns only last 3 results

We have the following elasticsearch mapping
{
index: 'data',
body: {
settings: {
analysis: {
analyzer: {
lowerCase: {
tokenizer: 'whitespace',
filter: ['lowercase']
}
}
}
},
mappings: {
// used for _all field
_default_: {
index_analyzer: 'lowerCase'
},
entry: {
properties: {
id: { type: 'string', analyzer: 'lowerCase' },
type: { type: 'string', analyzer: 'lowerCase' },
name: { type: 'string', analyzer: 'lowerCase' },
blobIds: {
type: 'nested',
properties: {
id: { type: 'string' },
filename: { type: 'string', analyzer: 'lowerCase' }
}
}
}
}
}
}
}
and a sample document that looks like the following:
{
"id":"5f02e9dae252732912749e13",
"type":"test_type",
"name":"test_name",
"creationTimestamp":"2020-07-06T09:07:38.775Z",
"blobIds":[
{
"id":"5f02e9dbe252732912749e18",
"filename":"test1.csv"
},
{
"id":"5f02e9dbe252732912749e1c",
"filename":"test2.txt"
},
// removed in-between elements for simplicity
{
"id":"5f02e9dbe252732912749e1e",
"filename":"test3.csv"
},
{
"id":"5f02e9dbe252732912749e58",
"filename":"test4.txt"
},
{
"id":"5f02e9dbe252732912749e5a",
"filename":"test5.csv"
},
{
"id":"5f02e9dbe252732912749e5d",
"filename":"test6.txt"
}
]
}
I have the following ES query which is querying documents for a certain timerange based on the creationTimestamp field and then filtering the nested field blobIds based on a user query, that should match the blobIds.filename field.
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"range": {
"creationTimestamp": {
"gte": "2020-07-01T09:07:38.775Z",
"lte": "2020-07-07T09:07:40.147Z"
}
}
},
{
"nested": {
"path": [
"blobIds"
],
"query": {
"query_string": {
"fields": [
"blobIds.filename"
],
"query": "*"
}
},
// returns the actual blobId hit
// and not the whole array
"inner_hits": {}
}
},
{
"query": {
"query_string": {
"query": "+type:*test_type* +name:*test_name*"
}
}
}
]
}
}
}
},
"sort": [
{
"creationTimestamp": {
"order": "asc"
},
"id": {
"order": "asc"
}
}
]
}
The above entry is clearly matching the query. However, it seems like there is something wrong with the returned inner_hits, since I always get only the last 3 blobIds elements instead of the whole array that contains 24 elements, as can be seen below.
{
"name": "test_name",
"creationTimestamp": "2020-07-06T09:07:38.775Z",
"id": "5f02e9dae252732912749e13",
"type": "test_type",
"blobIds": [
{
"id": "5f02e9dbe252732912749e5d",
"filename": "test4.txt"
},
{
"id": "5f02e9dbe252732912749e5a",
"filename": "test5.csv"
},
{
"id": "5f02e9dbe252732912749e58",
"filename": "test6.txt"
}
]
}
I find it very strange since I'm only doing a simple * query.
Using elasticsearch v1.7 and cannot update at the moment

Filter document on items in an array ElasticSearch

I am using ElasticSearch to search through documents. However, I need to make sure the current user is able to see those documents. Each document is tied to a community, in which the user may belong.
Here is the mapping for my Document:
export const mapping = {
properties: {
amazonId: { type: 'text' },
title: { type: 'text' },
subtitle: { type: 'text' },
description: { type: 'text' },
createdAt: { type: 'date' },
updatedAt: { type: 'date' },
published: { type: 'boolean' },
communities: { type: 'nested' }
}
}
I'm currently saving the ids of the communities the document belongs to in an array of strings. Ex: ["edd05cd0-0a49-4676-86f4-2db913235371", "672916cf-ee32-4bed-a60f-9a7c08dba04b"]
Currently, when I filter a query with {term: { communities: community.id } }, it returns all the documents, regardless of the communities it's tied to.
Here's the full query:
{
index: 'document',
filter_path: { filter: {term: { communities: community.id } } },
body: {
sort: [{ createdAt: { order: 'asc' } }]
}
}
This is the following result based on the community id of "b7d28e7f-7534-406a-981e-ddf147b5015a". NOTE: This is a return from my graphql, so the communities on the document are actual full objects after resolving the hits from the ES query.
"hits": [
{
"title": "The One True Document",
"communities": [
{
"id": "edd05cd0-0a49-4676-86f4-2db913235371"
},
{
"id": "672916cf-ee32-4bed-a60f-9a7c08dba04b"
}
]
},
{
"title": "Boring Document 1",
"communities": []
},
{
"title": "Boring Document 2",
"communities": []
},
{
"title": "Unpublished",
"communities": [
{
"id": "672916cf-ee32-4bed-a60f-9a7c08dba04b"
}
]
}
]
When I attempt to map the communities as {type: 'keyword', index: 'not_analyzed'} I receive an error that states, [illegal_argument_exception] Could not convert [communities.index] to boolean.
So do I need to change my mapping, my filter, or both? Searching around the docs for 6.6, I see that terms needs the non_analyzed mapping.
UPDATE --------------------------
I updated the communities mapping to be a keyword as suggested below. However, I still received the same result.
I updated my query to the following (using a community id that has documents):
query: { index: 'document',
body:
{ sort: [ { createdAt: { order: 'asc' } } ],
from: 0,
size: 5,
query:
{ bool:
{ filter:
{ term: { communities: '672916cf-ee32-4bed-a60f-9a7c08dba04b' } } } } } }
Which gives me the following results:
{
"data": {
"communities": [
{
"id": "672916cf-ee32-4bed-a60f-9a7c08dba04b",
"feed": {
"documents": {
"hits": []
}
}
}
]
}
}
Appears that my filter is working too well?
Since you are storing ids of communities, you should make sure that the ids don't get analyzed. For this, communities should be of type keyword. Second, you want to store an array of community ids, since a document can belong to multiple communities. To do this you don't need to make it of type nested. Nested has an altogether different use case.
To store values as an array, you need to make sure that while indexing you always pass the values for the field as an array, even if there is only a single value.
You need to change mapping and the way you are indexing values against field communities.
1. Update mapping as below:
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"amazonId": {
"type": "text"
},
"title": {
"type": "text"
},
"subtitle": {
"type": "text"
},
"description": {
"type": "text"
},
"createdAt": {
"type": "date"
},
"updatedAt": {
"type": "date"
},
"published": {
"type": "boolean"
},
"communities": {
"type": "keyword"
}
}
}
}
}
2. Adding a document to index:
PUT my_index/_doc/1
{
"title": "The One True Document",
"communities": [
"edd05cd0-0a49-4676-86f4-2db913235371",
"672916cf-ee32-4bed-a60f-9a7c08dba04b"
]
}
3. Filtering by community id:
GET my_index/_doc/_search
{
"query": {
"bool": {
"filter": [
{
"term": {
"communities": "672916cf-ee32-4bed-a60f-9a7c08dba04b"
}
}
]
}
}
}
Nested Field approach
1. Mapping:
PUT my_index_2
{
"mappings": {
"_doc": {
"properties": {
"amazonId": {
"type": "text"
},
"title": {
"type": "text"
},
"subtitle": {
"type": "text"
},
"description": {
"type": "text"
},
"createdAt": {
"type": "date"
},
"updatedAt": {
"type": "date"
},
"published": {
"type": "boolean"
},
"communities": {
"type": "nested"
}
}
}
}
}
2. Indexing document:
PUT my_index_2/_doc/1
{
"title": "The One True Document",
"communities": [
{
"id": "edd05cd0-0a49-4676-86f4-2db913235371"
},
{
"id": "672916cf-ee32-4bed-a60f-9a7c08dba04b"
}
]
}
3. Querying (using a nested query):
GET my_index_2/_doc/_search
{
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "communities",
"query": {
"term": {
"communities.id.keyword": "672916cf-ee32-4bed-a60f-9a7c08dba04b"
}
}
}
}
]
}
}
}
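You may have noticed that I used communities.id.keyword and not communities.id. With the default dynamic mapping, a string field such as communities.id is indexed as analyzed text with a keyword sub-field that holds the exact value, and a term query needs that exact value. To understand the reason for this in more detail, go through this.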

Inner hits on grandparents still not working

I have problems retrieving the inner_hits of my "grandparent" items.
Retrieving parents from a child query works fine, but I can't get it to also return the ones one more level up.
Any ideas of this?
The known issue for this should be fixed by now (2.3), and the workarounds are written for nested objects, not parent/child hierarchy data, so I can't get them to work for me.
Code in Sense-format:
POST /test/child/_search
{
"query": {
"has_parent": {
"type": "parent",
"query": {
"has_parent": {
"type": "grandparent",
"query": {
"match_all": {}
},
"inner_hits": {}
}
},
"inner_hits": {}
}
}
}
PUT /test/child/3?parent=2&routing=1
{
"id": 3,
"name": "child",
"parentid": 2
}
PUT /test/parent/2?parent=1&routing=1
{
"id": 2,
"name": "parent",
"parentid": 1
}
PUT /test/grandparent/1
{
"id": 1,
"name": "grandparent"
}
PUT /test
{
"mappings": {
"grandparent": {},
"parent": {
"_parent": {
"type": "grandparent"
}
},
"child": {
"_parent": {
"type": "parent"
}
}
}
}
This is sample code for finding the grandparent:
const filterPath = `hits.hits.inner_hits.activity.hits.hits.inner_hits.user.hits.hits._source*,
hits.hits.inner_hits.activity.hits.hits.inner_hits.user.hits.hits.inner_hits.fofo.hits.hits._source*`;
const source = ['id', 'name', 'thumbnail'];
const { body } = await elasticWrapper.client.search({
index: ElasticIndex.UserDataFactory,
filter_path: filterPath,
_source: source,
body: {
from,
size,
query: {
bool: {
must: [
{
match: {
relation_type: ElasticRelationType.Like,
},
},
{
has_parent: {
parent_type: ElasticRelationType.Post,
query: {
bool: {
must: [
{
term: {
id: {
value: req.params.id,
},
},
},
{
has_parent: {
parent_type: ElasticRelationType.User,
query: {
bool: {
must: [
{
exists: {
field: 'id',
},
},
],
should: [
{
has_child: {
type: ElasticRelationType.Follower,
query: {
bool: {
minimum_should_match: 1,
should: [
{
match: {
follower:
req.currentUser?.id,
},
},
{
match: {
following:
req.currentUser?.id,
},
},
],
},
},
inner_hits: {
_source: [
'follower',
'following',
'status',
],
},
},
},
],
},
},
inner_hits: {
_source: ['id', 'name', 'thumbnail'],
},
},
},
],
},
},
inner_hits: {},
},
},
],
},
},
sort: [
{
createdAt: {
order: 'desc',
},
},
],
},
});

Resources