Extracting and aggregating text values from a field with Elasticsearch

I'm new to Elasticsearch, and my first mission is as follows:
Here is the beginning of my query:
"_source": "pageVisitsValues",
"query": {
"filtered": {
"filter": {
"and": {
"filters": [
{
"term": {
"blockSize": 30
}
},
{
"range": {
"pageViews": {
"from": 1,
"to": null,
"include_lower": true,
"include_upper": true
}
}
},
{
"bool": {
"should": {
"bool": {
"must": {
"regexp": {
"pageVisitsValues": {
"value": ".*utm_medium=.*"
}
}
}
}
}
}
}
]
}
}
}
},
"aggregations": {
"Utm_term1": {
"terms": {
"field": "pageVisitsValues",
"size": 50,
"order": {
"_count": "desc"
}
}
},
Now here is part of the result:
"hits": { -
"total": 204223,
"max_score": 1,
"hits": [ -
{ -
"_index": "my_index",
"_type": "user",
"_id": "AX45WhAzfmq",
"_score": 1,
"_source": { -
"pageVisitsValues": [ -
"www.test.com/search?&q=bullion deals&utm_source=iterable&utm_medium=email&utm_campaign=weeklydeals"
]
}
},
{ -
"_index": "8767827_all_25216812968619266m",
"_type": "user",
"_id": "AX45vDJEamKugpqPWxqJ",
"_score": 1,
"_source": { -
"pageVisitsValues": [ -
"www.mytestsite.com/search?&q=indian",
"www.mytestsite.com/search?&q=indianhead cents&rows=80&view=grid&version=v2&start=160",
"www.mytestsite.com/product/24158/1867-indianvg",
"www.mytestsite.com/search?q=&x=11&y=7&silver&page=1&utm_source=bing&utm_medium=cpc&utm_campaign=cpa&utm_term=testx&utm_content=test",
"www.mytestsite.com/product/233871/flying-eagle-indian-head-cent-1856-1909",
"www.mytestsite.com/search?&q=silver eagles bu"
]
}
},
{ -
"_index": "my_index",
"_type": "user",
"_id": "userID",
"_score": 1,
"_source": { -
"pageVisitsValues": [ -
"www.mytestsite.com/product/1/bu-random-year?utm_source=criteo&utm_medium=email&utm_campaign=prospecting&dclid=cpyz4_j54fqcfcdbpaqd0dkkiw"
]
}
}
What I need is a way to aggregate all "utm_medium" values from the "pageVisitsValues" field, so my expected result should look something like this:
Utm_term1": { -
"doc_count_error_upper_bound": 426,
"sum_other_doc_count": 1557591,
"buckets": [ -
{ -
"key": "email",
"doc_count": 31283
},
{ -
"key": "cpc",
"doc_count": 23615
Any idea how I can do that?

I'd suggest indexing the extracted values as a separate field/list in ES and aggregating on that, since it would be less expensive. But if you don't want to do that, you can build a per-record field at query time with runtime_mappings and run a terms aggregation on it; check the script section in the official Elasticsearch docs.
To extract the keyword you need with a Painless script, you can use the .split function; check this ticket out. A sketch of the runtime-field approach is shown below.
If you need any further help, please reply to this comment.
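For reference, a minimal sketch of that runtime-field approach (an assumption on our part: it requires Elasticsearch 7.11+ where runtime_mappings is available, reads the raw URLs from _source, and introduces a utm_medium runtime field name of our own; the index name my_index is taken from the sample output):

GET my_index/_search
{
  "size": 0,
  "runtime_mappings": {
    "utm_medium": {
      "type": "keyword",
      "script": {
        "source": """
          // For each visited URL, emit the value of the utm_medium parameter, if present.
          def urls = params._source['pageVisitsValues'];
          if (urls != null) {
            for (def url : urls) {
              int i = url.indexOf('utm_medium=');
              if (i >= 0) {
                String rest = url.substring(i + 'utm_medium='.length());
                int amp = rest.indexOf('&');
                emit(amp >= 0 ? rest.substring(0, amp) : rest);
              }
            }
          }
        """
      }
    }
  },
  "aggs": {
    "Utm_term1": {
      "terms": {
        "field": "utm_medium",
        "size": 50,
        "order": { "_count": "desc" }
      }
    }
  }
}

The triple-quoted script uses Kibana Dev Tools syntax; splitting on '&' with splitOnToken would work just as well as the indexOf approach above.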

Related

Create the Elastic search query to show Random 5 Questions by category

I have the fields Category and Questions in the table.
My requirement: for the three categories mentioned below, I need the questions tagged against each (so I want both the Category and Questions fields in the query), written as an Elasticsearch query.
Category :
OLA
BNA
DRG
GET logstash-sdc-feedback/_search
{
  "_source": ["Category.keyword"],
  "size": 5,
  "query": {
    "bool": {
      "must": [
        { "match": { "Category.keyword": "OLA","BNA","DRG" } }
      ]
    }
  },
  "aggs": {
    "MyBuckets": {
      "terms": {
        "field": "questions.keyword","Category.keyword"
        "order": { "_count": "asc" },
        "size": "5"
      }
    }
  }
}
You can use terms query along with terms aggregation, to achieve your use case.
Adding a working example
Index Data:
{
"category": "XYZ",
"question": "d"
}
{
"category": "OLA",
"question": "a"
}
{
"category": "BNA",
"question": "b"
}
{
"category": "DRG",
"question": "c"
}
Search Query:
{
"query": {
"bool": {
"must": {
"terms": {
"category.keyword": [
"OLA",
"BNA",
"DRG"
]
}
}
}
},
"aggs": {
"top_tags": {
"terms": {
"field": "category.keyword"
},
"aggs": {
"top_faq_hits": {
"top_hits": {
"_source": {
"includes": [
"question"
]
},
"size": 1
}
}
}
}
}
}
Search Result:
"aggregations": {
"top_tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "BNA", // note this
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"question": "b" // note this
}
}
]
}
}
},
{
"key": "DRG",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"question": "c"
}
}
]
}
}
},
{
"key": "OLA",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"question": "a"
}
}
]
}
}
}
]
}
}

elasticsearch parent/child query logic

elastic version: 5.0.1
define mapping:
PUT test
{
"mappings": {
"my_parent": {
"properties": {
"key": {
"type": "keyword"
}
}
},
"my_child": {
"_parent": {
"type": "my_parent"
},
"properties": {
"key": {
"type": "keyword"
}
}
}
}
}
add demo data:
POST _bulk
{"update": {"_index": "test","_type": "my_parent","_id": "1"}}
{"doc": {"key": 1},"doc_as_upsert": true}
{"update": {"_index": "test","_type": "my_child","_parent": 1,"_id": "11"}}
{"doc": {"key": 11},"doc_as_upsert": true}
{"update": {"_index": "test","_type": "my_child","_parent": 1,"_id": "12"}}
{"doc": {"key": 12},"doc_as_upsert": true}
query:
POST test/my_parent/_search
{
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"term": {
"key": 3
}
},
{
"has_child": {
"type": "my_child",
"inner_hits": {
"name": "a"
},
"query": {
"term": {
"key": 11
}
}
}
}
]
}
},
{
"has_child": {
"type": "my_child",
"inner_hits": {
"name": "b"
},
"query": {
"term": {
"key": 12
}
}
}
}
]
}
}
}
}
}
result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": [
{
"_index": "test",
"_type": "my_parent",
"_id": "1",
"_score": 0,
"_source": {
"key": 1
},
"inner_hits": {
"a": {
"hits": {
"total": 1,
"max_score": 0.9808292,
"hits": [
{
"_type": "my_child",
"_id": "11",
"_score": 0.9808292,
"_routing": "1",
"_parent": "1",
"_source": {
"key": 11
}
}
]
}
},
"b": {
"hits": {
"total": 1,
"max_score": 0.9808292,
"hits": [
{
"_type": "my_child",
"_id": "12",
"_score": 0.9808292,
"_routing": "1",
"_parent": "1",
"_source": {
"key": 12
}
}
]
}
}
}
}
]
}
}
questions here:
Do the 'must'/'should'/'must_not' clauses have the same meaning in a plain search and in a parent/child search?
Why is the result of inner_hits with name 'a' returned?
The 'must'/'should'/'must_not' clauses have different meanings. Let me explain with examples from a plain search.
Understand these clauses through their equivalent SQL queries.
must: The clause (query) must appear in matching documents and will contribute to the score.
SQL: select * from user where country_code = 'US' AND state_code = 'NY'
Query DSL:
POST _search
{
"query": {
"bool": {
"must": [
{"term": {"country_code": "US"}},
{"term": {"state_code": "NY"}}
]
}
}
}
should: At least one of these clauses must match, like logical OR.
SQL: select * from user where country_code = 'US' OR state_code = 'NY'
Query DSL:
POST _search
{
"query": {
"bool": {
"should": [
{"term": {"country_code": "US"}},
{"term": {"state_code": "NY"}}
]
}
}
}
must_not: The clause must not match any of the documents.
SQL: select * from user where country_code != 'US' AND state_code != 'NY'
Query DSL:
POST _search
{
"query": {
"bool": {
"must_not": [
{"term": {"country_code": "US"}},
{"term": {"state_code": "NY"}}
]
}
}
}
Why is the result of inner_hits with name 'a' returned?
Because you put two has_child conditions inside the should filter. As explained above, it matches documents from (inner_hits.name = a ...) OR (inner_hits.name = b ...), so both sets of inner hits come back for the matching parent.
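For contrast, here is a minimal sketch (our own, reusing the same types and keys from the example above) that would only match parents having both children, by moving the two has_child clauses under must:

POST test/my_parent/_search
{
  "query": {
    "bool": {
      "filter": {
        "bool": {
          "must": [
            {
              "has_child": {
                "type": "my_child",
                "inner_hits": { "name": "a" },
                "query": { "term": { "key": 11 } }
              }
            },
            {
              "has_child": {
                "type": "my_child",
                "inner_hits": { "name": "b" },
                "query": { "term": { "key": 12 } }
              }
            }
          ]
        }
      }
    }
  }
}

Parent 1 still matches here because it has both children; with should, it matches as soon as either clause does.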

Elasticsearch aggregation with custom query parser

I cannot seem to aggregate my query results when using my custom query parser. I get a result set, but it is not aggregated. When using a standard query like match, everything turns out well.
What works:
GET pages/_search
{
"query": {
"match": {
"text": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
returns a nice aggregated result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 11.11176,
"hits": [
...
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6lzDazctHi91RE",
"_score": 3.3503218,
"_source": {
"rooturl": "document",
"type": "equation",
"url": "document:poly",
"text": "coefficient"
}
},
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6xzDazctHi91RF",
"_score": 3.3503218,
"_source": {
"rooturl": document",
"type": "equation",
"url": "document:poly",
"text": "dot"
}
}
...
]
},
"aggregations": {
"docs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "document",
"doc_count": 10
}
]
}
}
}
But when using my custom query parser, the result is not aggregated.
Query:
GET pages/_search
{
"query": {
"my_custom_query_parser": {
"query": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
Can anyone point me into the right direction?

How to sort bucket result based on viewed_timestamp in ElasticSearch?

I am new to Elasticsearch. I want to find the 10 most recently visited unique doc_id values.
I have done a first aggregation on doc_id and added a sub-aggregation to sort each group and keep a single result. Now I want to sort the buckets themselves.
I am not able to sort the buckets based on viewed_timestamp. How can I add that ordering to the first aggregation?
I have tried other solutions given on Stack Overflow, but they are not working for me. Can anyone help me solve this problem?
Query
{
"query": {
"constant_score": {
"filter": {
"term": { "username": "nil#gmail.com" }
}
}
},
"size":0,
"aggs":{
"title": {
"terms": {
"field": "doc_id",
"size":0
}
,
"aggs": {
"top": {
"top_hits": {
"sort": [
{
"viewed_timestamp": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
Bucket result:
{
"aggregations": {
"title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "b003",
"doc_count": 3,
"top_tag_hits": {
"hits": {
"total": 3,
"max_score": null,
"hits": [{
"_index": "visitedData",
"_type": "userdoc",
"_id": "AVak51Sp",
"_score": null,
"_source": {
"viewed_timestamp": "20160819T152359",
"content_type": "bp",
"title": "Data print",
"doc_id": "BP003"
},
"sort": [
1471620239000
]
}]
}
}
}, {
"key": "bp004",
"doc_count": 3,
"top_tag_hits": {
"hits": {
"total": 3,
"max_score": null,
"hits": [{
"_index": "visitedData",
"_type": "userdoc",
"_id": "AVak513Y8G",
"_score": null,
"_source": {
"viewed_timestamp": "20160819T152401",
"content_type": "bp",
"title": "Application Print",
"doc_id": "BP004"
},
"sort": [
1471620241000
]
}]
}
}
}]
}
}
}
It is because your viewed_timestamp type is not date; it is a plain timestamp string. You should change this field to a date format, such as:
"updateTime": "2017-01-12T21:28:49.562065"
If you're only trying to order by the timestamp, you could try using a max aggregation, like in this example:
https://www.elastic.co/guide/en/elasticsearch/reference/5.6/search-aggregations-metrics-top-hits-aggregation.html#_field_collapse_example
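For instance, a sketch adapted to this question (assuming viewed_timestamp is mapped as a date; the latest_view sub-aggregation name is ours), which orders the doc_id buckets by each bucket's most recent view:

POST visiteddata/_search
{
  "size": 0,
  "query": {
    "constant_score": {
      "filter": {
        "term": { "username": "nil#gmail.com" }
      }
    }
  },
  "aggs": {
    "title": {
      "terms": {
        "field": "doc_id",
        "size": 10,
        "order": { "latest_view": "desc" }
      },
      "aggs": {
        "latest_view": {
          "max": { "field": "viewed_timestamp" }
        },
        "top": {
          "top_hits": {
            "sort": [
              { "viewed_timestamp": { "order": "desc" } }
            ],
            "size": 1
          }
        }
      }
    }
  }
}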

How do I perform an "OR" filter on an aggregate?

I am trying to grab the first 10 documents grouped by domain. These 10 documents need a "crawl_date" value showing they haven't been crawled for a while, or haven't been crawled at all (e.g. a blank value). I have:
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '
{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"or":[
"term": {"crawl_date": ""},
"term": {"crawl_date": ""} // how do I put a range here? e.g. <= '2014-12-31'
]
},
"terms": {
"field": "domain"
}
}
}
}'
I am new to ES and am using version 2.2. Since the documentation isn't fully updated, I am struggling.
EDIT:
To clarify, I need 10 urls that haven't been crawled or haven't been crawled for a while. Each of those 10 urls has to come from a unique domain so that when I crawl them I don't overload someone's server.
Another Edit:
So, I need something like this (1 link for each of 10 unique domains):
1. www.domain1.com/page
2. www.domain2.com/url
etc...
Instead, I am getting just the domain and the number of pages:
"buckets": [
{
"key": "http://www.dailymail.co.uk",
"doc_count": 212
},
{
"key": "https://sedo.com",
"doc_count": 196
},
{
"key": "http://www.foxnews.com",
"doc_count": 118
},
{
"key": "http://data.worldbank.org",
"doc_count": 117
},
{
"key": "http://detail.1688.com",
"doc_count": 117
},
{
"key": "https://twitter.com",
"doc_count": 112
},
{
"key": "http://search.rakuten.co.jp",
"doc_count": 104
},
{
"key": "https://in.1688.com",
"doc_count": 92
},
{
"key": "http://www.abc.net.au",
"doc_count": 87
},
{
"key": "http://sport.lemonde.fr",
"doc_count": 85
}
]
The "hits" returns multiple pages for just 1 domain:
"hits": [
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/w/at-the-edge-of-the-orchard-tracy-chevalier/1121908441?ean=9780525953005",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
etc....
Barnes and Noble will quickly ban my UA if I try to crawl that many domains at the same time.
I need something like this:
1. "http://www.dailymail.co.uk/page/text.html",
2. "https://sedo.com/another/page"
3. "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb"
4. "http://www.starbucks.com/homepage/"
etc.
Using Aggregations
If you want to use aggregations, I'd suggest using the terms aggregation to remove the duplicates from your result set, and as a sub-aggregation I'd use the top_hits aggregation, which gives you the best hit from the aggregated documents of each domain (by default the score for each document within a domain should be the same).
Consequently the query will look like this:
POST sites/page/_search
{
"size": 0,
"aggs": {
"filtered_domains": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "domain",
"size": 10
},
"aggs": {
"pages": {
"top_hits": {
"size": 1
}
}
}
}
}
}
}
}
Giving you results like this:
"aggregations": {
"filtered_domains": {
"doc_count": 3,
"domains": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "barnesandnoble.com",
"doc_count": 2,
"pages": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "barnesandnoble.com/test2.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "barnesandnoble.com"
}
}
]
}
}
},
{
"key": "starbucks.com",
"doc_count": 1,
"pages": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "starbucks.com"
}
}
]
}
}
}
]
}
}
Using Parent/Child Relationships
If you can change your index structure, I'd suggest creating an index with either a parent/child relationship or nested documents.
If you do so, you can select 10 distinct domains and retrieve one (or more) specific pages for each domain.
Let me show you an example with parent/child (if you use Sense, you should be able to just copy-paste):
First generate the mappings for the documents:
PUT /sites
{
"mappings": {
"domain": {},
"page": {
"_parent": {
"type": "domain"
},
"properties": {
"crawl_date": {
"type": "date"
}
}
}
}
}
Insert some documents
PUT sites/domain/barnesandnoble.com
{}
PUT sites/domain/starbucks.com
{}
PUT sites/domain/dailymail.co.uk
{}
POST /sites/page/_bulk
{ "index": { "_id": "barnesandnoble.com/test.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "barnesandnoble.com/test2.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "starbucks.com/index.html", "parent": "starbucks.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "dailymail.co.uk/index.html", "parent": "dailymail.co.uk" }}
{}
Search for the urls to crawl
POST /sites/domain/_search
{
"query": {
"has_child": {
"type": "page",
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}]
}
}
}
},
"inner_hits": {
"size": 1
}
}
}
}
We do a has_child query on the parent type and therefore receive only distinct URLs of the parent type. To get the specific pages, we have to add an inner_hits query, which gives us the child documents leading to the hits in the parent type.
If you set the inner_hits size to 1, you get only one page per domain.
You can even add sorting in the inner_hits query... for example, you can sort by the crawl_date. ;)
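For instance, a minimal sketch of that sorted variant (our own, with the crawl-date filter shortened to match_all for brevity; the missing option is an assumption that puts never-crawled pages first):

POST /sites/domain/_search
{
  "query": {
    "has_child": {
      "type": "page",
      "query": { "match_all": {} },
      "inner_hits": {
        "size": 1,
        "sort": [
          { "crawl_date": { "order": "asc", "missing": "_first" } }
        ]
      }
    }
  }
}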
The original search above (without the inner_hits sort) gives you the following result:
"hits": [
{
"_index": "sites",
"_type": "domain",
"_id": "starbucks.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1.9664046,
"_routing": "starbucks.com",
"_parent": "starbucks.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "dailymail.co.uk",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "dailymail.co.uk/index.html",
"_score": 1.9664046,
"_routing": "dailymail.co.uk",
"_parent": "dailymail.co.uk",
"_source": {}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "barnesandnoble.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 2,
"max_score": 1.4142135,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "barnesandnoble.com/test.html",
"_score": 1.4142135,
"_routing": "barnesandnoble.com",
"_parent": "barnesandnoble.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
}
]
Finally, let me note one thing: the parent/child relationship comes with a small cost at query time. If this isn't a problem for your use case, I'd go for this solution.
I suggest you use the exists filter instead of trying to match an empty term (the missing filter is deprecated in 2.2). Then, the range filter will help you filter out the documents you don't need.
Finally, since you have used the absolute URL as the id, make sure to aggregate on the _uid field and not the domain field; that way you'll get unique counts per exact page.
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2014-12-31T00:00:00.000"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "_uid"
}
}
}
}
}
}'
You have to use a filter aggregation and then a sub-aggregation:
{
"size": 10,
"aggs": {
"filter_date": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "crawl_date"
}
}
]
}
},
{
"range": {
"crawl_date": {
"from": "now-100d"
}
}
}
]
}
},
"aggs": {
"group_by_domain": {
"terms": {
"field": "domain"
}
}
}
}
}
}
