Elasticsearch bulk load strangely missing 1 of 3 documents

I'm running into strange problems following the shingles example at https://www.elastic.co/guide/en/elasticsearch/guide/current/shingles.html
When I try to index the three documents from that tutorial, only two of them get indexed; the document with ID 3 is never indexed.
The request POSTed to http://elastic:9200/myIndex/page/_bulk is:
{ "index": { "_id": 1 }}
{ "text": "Sue ate the alligator" }
{ "index": { "_id": 2 }}
{ "text": "The alligator ate Sue" }
{ "index": { "_id": 3 }}
{ "text": "Sue never goes anywhere without her alligator skin purse" }
But the response is:
{
"took": 18,
"errors": false,
"items": [
{
"index": {
"_index": "myIndex",
"_type": "page",
"_id": "1",
"_version": 1,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"status": 201
}
},
{
"index": {
"_index": "myIndex",
"_type": "page",
"_id": "2",
"_version": 1,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"status": 201
}
}
]}
Index and mappings definition:
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"filter_shingle": {
"type": "shingle",
"max_shingle_size": 5,
"min_shingle_size": 2,
"output_unigrams": "false"
},
"filter_stop": {
"type": "stop"
}
},
"analyzer": {
"analyzer_shingle": {
"tokenizer": "standard",
"filter": ["standard", "lowercase", "filter_stop", "filter_shingle"]
}
}
}
},
"mappings": {
"page": {
"properties": {
"text": {
"type": "string",
"index_options": "offsets",
"analyzer": "standard",
"fields": {
"shingles": {
"search_analyzer": "analyzer_shingle",
"analyzer": "analyzer_shingle",
"type": "string"
}
}
},
"title": {
"type": "string",
"index_options": "offsets",
"analyzer": "standard",
"search_analyzer": "standard"
}
}
}
}}

When POSTing documents in bulk, you need to make sure to include a newline character after the last line, as explained in the official docs:
curl -XPOST http://elastic:9200/myIndex/page/_bulk -d '
{ "index": { "_id": 1 }}
{ "text": "Sue ate the alligator" }
{ "index": { "_id": 2 }}
{ "text": "The alligator ate Sue" }
{ "index": { "_id": 3 }}
{ "text": "Sue never goes anywhere without her alligator skin purse" }
' <--- new line
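Note that this matters even more when the body lives in a file: curl's plain -d flag strips newlines when reading from a file, so the bulk structure (and the trailing newline) would be lost. The bulk docs recommend --data-binary for file input. A minimal sketch, assuming the payload above is saved as a file named bulk.json that ends with a newline:
curl -XPOST http://elastic:9200/myIndex/page/_bulk --data-binary @bulk.json
Afterwards you can confirm that all three documents were indexed (after a refresh, since newly indexed documents only become visible to search after one):
curl -XPOST http://elastic:9200/myIndex/_refresh
curl -XGET http://elastic:9200/myIndex/page/_count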

Related

Keyword normalizer not applied on document

I'm using Elasticsearch 6.8. Here is my mapping:
{
"index_patterns": [
"my_index_*"
],
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 1
},
"analysis": {
"analyzer": {
"lower_ascii_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"normalizer": {
"my_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"audit_conformity": {
"dynamic": "false",
"properties": {
"country": {
"type": "keyword",
"normalizer": "my_normalizer"
},
[…]
Then I post a document with this body
{
"_source": {
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1"
}
}
When I search for the document, the country is still capitalized
GET /my_index_country/_search
I get
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "my_index_country",
"_type": "my_index",
"_id": "LOT0fYIBCNP9gFG_7cet",
"_score": 1,
"_source": {
"_source": {
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1",
}
}
}
]
}
}
What am I doing wrong?
You are doing nothing wrong, but normalizers (and analyzers alike) never modify your source document, only whatever is indexed from it.
This means that the source document keeps holding MX, while underneath mx is what gets indexed for the country field.
If you want to lowercase the country field in the source itself, you should use an ingest pipeline with a lowercase processor instead, which will modify your source document before indexing it:
PUT _ingest/pipeline/lowercase-pipeline
{
"processors": [
{
"lowercase": {
"field": "country"
}
}
]
}
Then use it when indexing your documents:
PUT my_index_country/my_index/LOT0fYIBCNP9gFG_7cet?pipeline=lowercase-pipeline
{
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1",
}
GET my_index_country/my_index/LOT0fYIBCNP9gFG_7cet
Result =>
{
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "mx",
"user_entity_id": "1",
}
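Note that if the goal was only case-insensitive matching, rather than changing the stored value, the original normalizer already achieves it: the indexed term is lowercased even though _source keeps MX. A quick sketch to verify, using the original index without the pipeline:
GET my_index_country/_search
{
  "query": {
    "term": {
      "country": "mx"
    }
  }
}
This should match the document, because term-level queries on a keyword field with a normalizer are compared against the normalized token mx, not the raw source value.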

Searching with comma-separated string in Elasticsearch query, indexing

I'm trying to search with a string that contains multiple comma-separated terms. [The match need not cover the whole field value; it can be partial, but each passed term should appear in the text.]
Note: I have tried n-grams as well, which do not give me the right data.
(Example: the search term "Data Science" returns everything matching "Data", "Science", or "data science".)
Doc In ES:
{
"_index": "questions_dev",
"_type": "_doc",
"_id": "188",
"_score": 6.6311107,
"_source": {
"questionId": 188,
"questionText": "What other social media platforms do you use on your own time?",
"domainId": 2,
"subdomainId": 25,
"type": "TEXT",
"difficulty": 1,
"time": 600,
"domain": "Domain Specific",
"subdomain": "Social Media Specialist",
"skill": ["social media"]
}
}
What I have done so far:
Index:
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"default": {
"tokenizer": "custom_tokenizer",
"filter": ["lowercase"]
}
},
"tokenizer": {
"custom_tokenizer": {
"type": "pattern",
"pattern": ",",
},
}
}
},
"mappings": {
"properties": {
"questionId": {
"type": "long"
},
"questionText": {
"type": "text",
},
"domain": {
"type": "text"
},
"subdomain": {
"type": "text"
},
"type":{
"type": "keyword"
},
"difficulty":{
"type": "keyword"
},
"totaltime":{
"type": "keyword"
},
"domainId":{
"type": "keyword"
},
"subdomainId":{
"type": "keyword"
}
}
}
}
Query:
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"fields": ["questionText","skill"],
"query": "social media"
}
}
]
}
}
}
Output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 0,
"relation": "eq"
},
"max_score": null,
"hits": []
}
}
Expected Output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 6.6311107,
"hits": [
{
"_index": "questions_development",
"_type": "_doc",
"_id": "188",
"_score": 6.6311107,
"_source": {
"questionId": 188,
"questionText": "What other social media platforms do you use on your own time?",
"domainId": 2,
"subdomainId": 25,
"type": "TEXT",
"difficulty": 1,
"time": 600,
"domain": "Domain Specific",
"subdomain": "Social Media Specialist",
"skill": []
}
}
]
}
}
Goal:
Search with a string for all the docs, which contains the string.
Example:
If I search with "social media" it should return me the above doc.
(In my case it's not returning.)
The search should also support a comma-separated mechanism, meaning I can pass "social media, own time" and expect the questionText of the results to contain any of these strings.
The data you are indexing, social media, own time, contains whitespace between the comma and own time. So the tokens generated with your previous mapping are:
{
"tokens": [
{
"token": " social media", <-- note the preceding whitespace here
"start_offset": 0,
"end_offset": 12,
"type": "word",
"position": 0
},
{
"token": " own time", <-- note the preceding whitespace here
"start_offset": 13,
"end_offset": 22,
"type": "word",
"position": 1
}
]
}
Therefore, when you search with "query": "social media" (no whitespace at the beginning), no results are returned. However, if you query " social media" (with a leading whitespace), there will be a search result.
To remove leading and trailing whitespace from each token in a stream, you can use the trim token filter.
Here is a working example with index data, mapping, and search query.
Index Mapping:
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"default": {
"tokenizer": "custom_tokenizer",
"filter": [
"lowercase",
"trim" <-- note this
]
}
},
"tokenizer": {
"custom_tokenizer": {
"type": "pattern",
"pattern": ",",
"filter": [
"trim" <-- note this
]
}
}
}
},
"mappings": {
"properties": {
"questionText": {
"type": "text"
}
}
}
}
Index Data:
{ "questionText": "social media" }
{ "questionText": "social media, own time" }
Search Query:
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"fields": [
"questionText"
],
"query": "own time" <-- no whitespace included in the
beginning
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "my-index",
"_type": "_doc",
"_id": "2",
"_score": 0.60996956,
"_source": {
"questionText": "social media, own time"
}
}
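To see what the trim filter does to the tokens, you can point the _analyze API at the analyzer; a quick sketch, assuming the index above is called my-index:
POST my-index/_analyze
{
  "analyzer": "default",
  "text": "social media, own time"
}
With the trim filter in place, the tokens should come back as social media and own time, without the leading whitespace that broke the earlier query.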
Update 1:
Index Settings
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "pattern",
"pattern": ","
}
}
}
}
}
Index Data:
{
"questionText": "What other platforms do you use on your ?"
}
{
"questionText": "What other social time platforms do you use on your?"
}
{
"questionText": "What other social media platforms do you use on your?"
}
{
"questionText": "What other platforms do you use on your own time?"
}
Search Query:
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"fields": "questionText",
"query": "social media, own time"
}
}
]
}
}
}
Search Result
"hits": [
{
"_index": "my-index3",
"_type": "_doc",
"_id": "1",
"_score": 2.5628972,
"_source": {
"questionText": "What other social media platforms do you use on your own time?"
}
},
{
"_index": "my-index3",
"_type": "_doc",
"_id": "2",
"_score": 1.3862944,
"_source": {
"questionText": "What other social media platforms do you use on your?"
}
},
{
"_index": "my-index3",
"_type": "_doc",
"_id": "3",
"_score": 1.3862944,
"_source": {
"questionText": "What other platforms do you use on your own time?"
}
}
]

Search across _all field in Elastic and return results with highlighting

I am using Elastic 5.4 and want to query across an index containing documents of multiple types (type a and type b). Below are example documents in the index:
Documents:
{
"_index": "test",
"_type": "a",
"_id": "1",
"_source": {
"id": "1",
"name": "john-usa-soccer",
"class": "5",
"lastseen": "2017-07-05",
"a_atts": {
"lastname": "tover",
"hobby": "soccer",
"country": "usa"
}
}
}
{
"_index": "test",
"_type": "b",
"_id": "2",
"_source": {
"id": "2",
"name": "john-usa",
"class": "5",
"lastseen": "2017-07-05",
"b_atts": {
"lastname": "kaml",
"hobby": "baseball",
"country": "usa"
}
}
}
Mapping:
{
"settings": {
"analysis": {
"analyzer": {
"my_ngram_analyzer": {
"tokenizer": "my_ngram_tokenizer"
}
},
"tokenizer": {
"my_ngram_tokenizer": {
"type": "ngram",
"min_gram": "3",
"max_gram": "3",
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings": {
"a": {
"dynamic_templates": [
{
"strings": {
"match": "*",
"match_mapping_type": "string",
"mapping": {
"type": "text",
"analyzer": "my_ngram_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
},
"suggest": {
"type": "completion",
"analyzer": "simple"
},
"analyzer1": {
"type": "text",
"analyzer": "simple"
},
"analyzer2": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
]
},
"b": {
"dynamic_templates": [
{
"strings": {
"match": "*",
"match_mapping_type": "string",
"mapping": {
"type": "text",
"analyzer": "my_ngram_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
},
"suggest": {
"type": "completion",
"analyzer": "simple"
},
"analyzer1": {
"type": "text",
"analyzer": "simple"
},
"analyzer2": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
]
}
}
}
My query is to search all documents which contain 'john' across any of the fields in any type, and to highlight the fields where the match was found. This query is constructed as per the Elastic documentation. My mapping has my_ngram_analyzer configured as the analyzer, instead of the default analyzer, for all string fields in the schema.
Query: http://localhost:9200/student/_search
{
"query": {
"bool": {
"should": [
{ "match": { "_all": "john"} }
]
}
},
"highlight": {
"fields": {
"name": {
"require_field_match": false
},
"a_atts.lastname":{
"require_field_match": false
},
"a_atts.hobby":{
"require_field_match": false
},
"a_atts.country":{
"require_field_match": false
}
}
}
}
Response:
{
"took": 79,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.17669111,
"hits": [
{
"_index": "student",
"_type": "a",
"_id": "AV1WjBeYEZrDBYsdGMtY",
"_score": 0.17669111,
"_source": {
"name": "john-usa-soccer",
"class": "5",
"lastseen": "2017-07-05",
"a_atts": {
"lastname": "tover",
"hobby": "soccer",
"country": "usa"
}
}
},
{
"_index": "student",
"_type": "b",
"_id": "AV1WjHFxEZrDBYsdGMtZ",
"_score": 0.17669111,
"_source": {
"name": "john-usa",
"class": "5",
"lastseen": "2017-07-05",
"b_atts": {
"lastname": "kaml",
"hobby": "baseball",
"country": "usa"
}
}
}
]
}
}
However, executing the above query against the index returns matching documents with their _source content but no highlight field. The following is missing:
"highlight": {
"name": [
"<em>john</em>-usa-soccer"
]
}
How can I return highlight in the results?
I got the highlighter to work by following the answer provided in this link:
"highlight": {
"fields": {
"*": {}
},
"require_field_match": false
}
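Putting it together, the full request would look something like this (a sketch based on the query from the question):
GET /student/_search
{
  "query": {
    "bool": {
      "should": [
        { "match": { "_all": "john" } }
      ]
    }
  },
  "highlight": {
    "fields": {
      "*": {}
    },
    "require_field_match": false
  }
}
The "*" wildcard requests highlighting for every field, and require_field_match: false allows fields the query did not directly target (which is all of them, since the query runs against _all) to produce fragments.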

Synonym analyzer not working

Here are my settings:
{
"countries": {
"aliases": {},
"mappings": {
"country": {
"properties": {
"countryName": {
"type": "string"
}
}
}
},
"settings": {
"index": {
"creation_date": "1472140045116",
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"synonym"
],
"tokenizer": "whitespace"
}
}
},
"number_of_shards": "5",
"number_of_replicas": "1",
"uuid": "7-fKyD9aR2eG3BwUNdadXA",
"version": {
"created": "2030599"
}
}
},
"warmers": {}
}
}
My synonym.txt file is in the config folder inside the main elasticsearch folder.
Here is my query:
query: {
query_string: {
fields: ["countryName"],
default_operator: "AND",
query: searchInput,
analyzer: "synonym"
}
}
The words in synonym.txt are: us, u.s., united states.
So this doesn't work. What's interesting is that search works as normal except when I enter any of the words in the synonym.txt file. For example, typing us into the search used to give me results; with this analyzer, us doesn't give me anything.
I've closed and reopened the index on my ES server, and it still doesn't work.
EDIT
An example of a document:
{
"_index": "countries",
"_type": "country",
"_id": "57aabeb80057405968de152b",
"_score": 1,
"_source": {
"countryName": "United States"
}
Example of searchInput (this is coming from the front-end):
united states
EDIT #2:
Here is my updated index config file:
{
"countries": {
"aliases": {},
"mappings": {},
"settings": {
"index": {
"number_of_shards": "5",
"creation_date": "1472219634083",
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"synonym"
],
"tokenizer": "whitespace"
}
}
},
"country": {
"properties": {
"countryName": {
"type": "string",
"analyzer": "synonym"
},
"number_of_replicas": "1",
"uuid": "50ZwpIVFTqeD_rJxlmd59Q",
"version": {
"created": "2030599"
}
}
},
"warmers": {}
}
}
}
}
When I add documents and then search them, the synonym analyzer does not work for me.
EDIT #3
Here are 2 documents in the index:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [{
"_index": "stocks",
"_type": "stock",
"_id": "2",
"_score": 1,
"_source": {
"countryName": "United States"
}
}, {
"_index": "stocks",
"_type": "stock",
"_id": "1",
"_score": 1,
"_source": {
"countryName": "Canada"
}
}]
}
}
You are close, but I suggest reading this section of the documentation thoroughly to better understand this functionality.
As a solution:
PUT /countries
{
"mappings": {
"country": {
"properties": {
"countryName": {
"type": "string",
"analyzer": "synonym"
}
}
}
},
"settings": {
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"lowercase",
"synonym"
],
"tokenizer": "whitespace"
}
}
}
}
}
You need to delete the index and create it again with the mapping above.
Then use this query:
"query": {
"query_string": {
"fields": [
"countryName"
],
"default_operator": "AND",
"query": "united states"
}
}
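To check whether the synonyms are being picked up at all, it can help to run the analyzer directly via the _analyze API; a quick sketch against the recreated index (query-string form, since this is a 2.x cluster):
GET /countries/_analyze?analyzer=synonym&text=us
If synonym.txt is being read, the response should include tokens for the other variants (u.s., united states) at the same position.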
Have you deleted and recreated the index after pushing the txt file?
I think you should remove the "synonyms": "" entry if you are using "synonyms_path".

edge_ngram filter and not_analyzed to match search

I have the following elastic search configuration:
PUT /my_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
},
"snow_filter" : {
"type" : "snowball",
"language" : "English"
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"snow_filter",
"autocomplete_filter"
]
}
}
}
}
}
PUT /my_index/_mapping/my_type
{
"my_type": {
"properties": {
"name": {
"type": "multi_field",
"fields": {
"name": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "snowball"
},
"not": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
POST /my_index/my_type/_bulk
{ "index": { "_id": 1 }}
{ "name": "Brown foxes" }
{ "index": { "_id": 2 }}
{ "name": "Yellow furballs" }
{ "index": { "_id": 3 }}
{ "name": "my discovery" }
{ "index": { "_id": 4 }}
{ "name": "myself is fun" }
{ "index": { "_id": 5 }}
{ "name": ["foxy", "foo"] }
{ "index": { "_id": 6 }}
{ "name": ["foo bar", "baz"] }
I am trying to get a search to return only item 6, which has a name of "foo bar", and I am not quite sure how. This is what I am doing right now:
GET /my_index/my_type/_search
{
"query": {
"match": {
"name": {
"query": "foo b"
}
}
}
}
I know it's a combination of how the tokenizer is splitting the word, but I'm sort of lost on how to be both flexible enough and strict enough to match this. I am guessing I need a multi-field on my name mapping, but I am not sure. How can I fix the query and/or my mapping to satisfy my needs?
You're already close. Since your edge_ngram analyzer generates tokens with a minimum length of 1, your query gets tokenized into "foo" and "b"; and since the default match query operator is "or", the query matches every document that has a term starting with "b" (or "foo"), which is three of the docs.
Using the "and" operator seems to do what you want:
POST /my_index/my_type/_search
{
"query": {
"match": {
"name": {
"query": "foo b",
"operator": "and"
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.4451914,
"hits": [
{
"_index": "test_index",
"_type": "my_type",
"_id": "6",
"_score": 1.4451914,
"_source": {
"name": [
"foo bar",
"baz"
]
}
}
]
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/4f6fb7c1fdc6942023091ee1433d7490e04e7dea
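As a side note, if you ever need a strictly exact match rather than prefix-style matching, the mapping above already defines a not_analyzed sub-field, name.not, which can be queried with a term query; a sketch:
GET /my_index/my_type/_search
{
  "query": {
    "term": {
      "name.not": "foo bar"
    }
  }
}
Because name.not is not analyzed, the term has to match the stored value exactly, including case, so this returns only document 6.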
