Elasticsearch ngram tokenizer returns all results regardless of query input - elasticsearch

I am trying to build a query to search for records in the following format: TR000002_1_2020.
Users should be able to search for results the following ways:
TR000002 or 2_1_2020 or TR000002_1_2020 or 2020. I figured an ngram tokenization query would be best suited for my needs. I am using Elasticsearch 6.8, so I cannot use the built-in Search-As-You-Type feature introduced in Elasticsearch 7.
Here's my implementation I followed from docs here. The only thing I modified was EdgeNGram -> NGram as the user can search from any point of the text.
My Analysis block looks like this:
.Analysis(a => a
.Analyzers(aa => aa
.Custom("autocomplete", ca => ca
.Tokenizer("autocomplete")
.Filters(new string[] {
"lowercase"
})
)
.Custom("autocomplete_search", ca => ca
.Tokenizer("lowercase")
)
)
.Tokenizers(t => t
.NGram("autocomplete", e => e
.MinGram(2)
.MaxGram(16)
.TokenChars(new TokenChar[] {
TokenChar.Letter,
TokenChar.Digit,
TokenChar.Punctuation,
TokenChar.Symbol
})
)
)
)
Then in my mapping I define:
.Text(t => t
.Name(tr => tr.TestRecordId)
.Analyzer("autocomplete")
.SearchAnalyzer("autocomplete_search")
)
When I search for TR000002, my query returns all results instead of just the records that contain those specific characters. What am I doing wrong? Is there a better tokenizer for this specific use case? Thanks!
EDIT: Here's a sample of what is returned:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 27,
"max_score" : 0.105360515,
"hits" : [
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "3",
"_score" : 0.105360515,
"_source" : {
"id" : 3,
"testRecordId" : "TR000002_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 12,
"testStatus" : {
"testStatusId" : 12,
"name" : "Complete: Postponed Until Further Notice"
},
"discriminator" : "SingleEventEffectsRecord",
"testRecordServiceOrders" : [
{
"testRecordId" : 3,
"serviceOrderId" : 9,
"serviceOrder" : {
"serviceOrderId" : 9,
"serviceOrderNumber" : "105702"
}
}
],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
},
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"sEETestRates" : [ ]
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "11",
"_score" : 0.105360515,
"_source" : {
"id" : 11,
"testRecordId" : "TR000011_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"partLDC" : "12",
"waferLot" : "1",
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"testStartDate" : "2020-07-30T00:00:00",
"actualCompletionDate" : "2020-07-31T00:00:00"
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "17",
"_score" : 0.105360515,
"_source" : {
"id" : 17,
"testRecordId" : "TR000017_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "lewallen",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false
}
},
Also here's what shows for mapping:
"testRecordId" : {
"type" : "text",
"analyzer" : "autocomplete",
"search_analyzer" : "autocomplete_search"
},
I guess I should also mention, I've been testing this query in the console like so:
GET test-records-development/_search
{
"query": {
"match": {
"testRecordId": {
"query": "TR000002_1_2020"
}
}
}
}
EDIT 2: Added API response from index _settings endpoint:
{
"test-records-development-09-09-2020-02-00-00" : {
"settings" : {
"index" : {
"number_of_shards" : "5",
"provided_name" : "test-records-development-09-09-2020-02-00-00",
"creation_date" : "1599617013874",
"analysis" : {
"analyzer" : {
"autocomplete" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "autocomplete"
},
"autocomplete_search" : {
"type" : "custom",
"tokenizer" : "lowercase"
}
},
"tokenizer" : {
"autocomplete" : {
"token_chars" : [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram" : "2",
"type" : "ngram",
"max_gram" : "16"
}
}
},
"number_of_replicas" : "0",
"uuid" : "FSeCa0YwRCOJVbjfxYGkig",
"version" : {
"created" : "6080199"
}
}
}
}
}

As I don't have access to your analyzer settings in JSON format, I can't confirm it, but most probably the issue is with your search analyzer autocomplete_search, which is creating search-time tokens that match the index-time tokens.
For example: you are searching for TR000002_1_2020, and if the analyzer creates 2020 as a token, then for a document containing TR000011_1_2020 it also creates a 2020 token, so your query will match it.
You can use the analyze API to check the tokens generated by an analyzer and, as mentioned earlier, most likely there are some tokens that match, as shown above.

Related

Elastic search match phrase query with single token

So I am trying to understand how the match_phrase query works under certain circumstances
with Elasticsearch [we have version 6.8 set up as of now]. When I give it a string with multiple tokens, profiling shows it is running a phrase query, but when I run it with a single token, profiling shows it is running a TermQuery internally. I am trying to understand: shouldn't it be independent of the input, and fail to return a match if the positioning of the terms is not correct? Attaching queries and o/p -
Query with multiple tokens -
GET potato_testv3/_search
{"profile": "true",
"query": {
"bool": {
"must": [
{ "match_phrase": { "skill_set": {"query":"potato farmer"} }}
]
}
}
}
Output of the above -
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 0.5753642,
"hits" : [
{
"_index" : "potato_testv3",
"_type" : "recruiterinsightsv11",
"_id" : "4RShdnkBc8OOeUFVkncD",
"_score" : 0.5753642,
"_source" : {
"skill_set" : [
"silly webdriver",
"uft",
"uft/qtp",
"potato farmer"
]
}
}
]
},
"profile" : {
"shards" : [
{
"id" : "[5QVxJbTCSU-ruYT9EHsujA][potato_testv3][0]",
"searches" : [
{
"query" : [
{
"type" : "PhraseQuery",
"description" : """skill_set:"potato farmer"""",
"time_in_nanos" : 338986,
"breakdown" : {
"score" : 15362,
"build_scorer_count" : 2,
"match_count" : 1,
"create_weight" : 55661,
"next_doc" : 74248,
"match" : 39624,
"create_weight_count" : 1,
"next_doc_count" : 2,
"score_count" : 1,
"build_scorer" : 154084,
"advance" : 0,
"advance_count" : 0
}
}
],
"rewrite_time" : 3932,
"collector" : [
{
"name" : "CancellableCollector",
"reason" : "search_cancelled",
"time_in_nanos" : 48431,
"children" : [
{
"name" : "SimpleTopScoreDocCollector",
"reason" : "search_top_hits",
"time_in_nanos" : 19840
}
]
}
]
}
],
"aggregations" : [ ]
}
]
}
}
Query with single token -
GET potato_testv3/_search
{"profile": "true",
"query": {
"bool": {
"must": [
{ "match_phrase": { "skill_set": {"query":"potato"} }}
]
}
}
}
Output of above -
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 0.2876821,
"hits" : [
{
"_index" : "potato_testv3",
"_type" : "recruiterinsightsv11",
"_id" : "4RShdnkBc8OOeUFVkncD",
"_score" : 0.2876821,
"_source" : {
"skill_set" : [
"silly webdriver",
"uft",
"uft/qtp",
"potato farmer"
]
}
}
]
},
"profile" : {
"shards" : [
{
"id" : "[TeKxvYLJQfG_GVtD3bmpiw][potato_testv3][0]",
"searches" : [
{
"query" : [
{
"type" : "TermQuery",
"description" : "skill_set:potato",
"time_in_nanos" : 52214,
"breakdown" : {
"score" : 11310,
"build_scorer_count" : 2,
"match_count" : 0,
"create_weight" : 30974,
"next_doc" : 1314,
"match" : 0,
"create_weight_count" : 1,
"next_doc_count" : 2,
"score_count" : 1,
"build_scorer" : 8610,
"advance" : 0,
"advance_count" : 0
}
}
],
"rewrite_time" : 3761,
"collector" : [
{
"name" : "CancellableCollector",
"reason" : "search_cancelled",
"time_in_nanos" : 20912,
"children" : [
{
"name" : "SimpleTopScoreDocCollector",
"reason" : "search_top_hits",
"time_in_nanos" : 15758
}
]
}
]
}
],
"aggregations" : [ ]
}
]
}
}
In case if it helps , schema of the index used -
{
"potato_testv3" : {
"mappings" : {
"recruiterinsightsv11" : {
"dynamic" : "false",
"properties" : {
"skill_set" : {
"type" : "text",
"norms" : false,
"fielddata" : true
}
}
}
}
}
}
You are executing the same match_phrase query, once with a search string made up of multiple terms, once with a search string of a single token.
When executing an Elasticsearch query, Elasticsearch will optimise the query and translate it to the relevant queries on Lucene level. A phrase-query is more expensive to execute as
all terms of the search string need to match, and on top of that
the positions of the terms in a matching document need to be in the very same order as in the search string
If your search string only consist of a single term Elasticsearch can skip all of that extra effort and simply query for documents matching that single search term. What you observe therefore, is making perfect sense. It shows you how Elasticsearch is optimising the query while executing it.

Elasticsearch cluster NodeStatsCollector time out when collecting data

A few minutes after the elasticsearch cluster run, all nodes gives the error:
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
All nodes give this error, yet the indexer — a Java program that indexes data with the bulk API — works very well, and there is no error in the logs of my indexer program. But the Elasticsearch nodes give this error.
I am running 4 Elasticsearch nodes and Kibana on one machine with Windows Server 2019 Datacenter.
the version of elasticsearch and kibana is 7.10.0 and Cluster Status is :
{
"_nodes" : {
"total" : 4,
"successful" : 4,
"failed" : 0
},
"cluster_name" : "es-cluster",
"cluster_uuid" : "RRhGhaElfh5lUxGfsKg",
"timestamp" : 1245375859907,
"status" : "green",
"indices" : {
"count" : 1,
"shards" : {
"total" : 9,
"primaries" : 3,
"replication" : 2.0,
"index" : {
"shards" : {
"min" : 9,
"max" : 9,
"avg" : 9.0
},
"primaries" : {
"min" : 3,
"max" : 3,
"avg" : 3.0
},
"replication" : {
"min" : 2.0,
"max" : 2.0,
"avg" : 2.0
}
}
},
"docs" : {
"count" : 0,
"deleted" : 0
},
"store" : {
"size" : "36.1mb",
"size_in_bytes" : 37936718,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"total_count" : 0,
"hit_count" : 0,
"miss_count" : 0,
"cache_size" : 0,
"cache_count" : 0,
"evictions" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 0,
"memory" : "0b",
"memory_in_bytes" : 0,
"terms_memory" : "0b",
"terms_memory_in_bytes" : 0,
"stored_fields_memory" : "0b",
"stored_fields_memory_in_bytes" : 0,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "0b",
"norms_memory_in_bytes" : 0,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "0b",
"doc_values_memory_in_bytes" : 0,
"index_writer_memory" : "233.1mb",
"index_writer_memory_in_bytes" : 244509188,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0,
"fixed_bit_set" : "0b",
"fixed_bit_set_memory_in_bytes" : 0,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "boolean",
"count" : 3,
"index_count" : 1
},
{
"name" : "date",
"count" : 4,
"index_count" : 1
},
{
"name" : "geo_point",
"count" : 1,
"index_count" : 1
},
{
"name" : "integer",
"count" : 8,
"index_count" : 1
},
{
"name" : "ip",
"count" : 2,
"index_count" : 1
},
{
"name" : "keyword",
"count" : 12,
"index_count" : 1
},
{
"name" : "object",
"count" : 1,
"index_count" : 1
},
{
"name" : "text",
"count" : 17,
"index_count" : 1
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "stop",
"count" : 3,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "standard",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "arabic_normalization",
"count" : 1,
"index_count" : 1
},
{
"name" : "decimal_digit",
"count" : 1,
"index_count" : 1
},
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "persian_normalization",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 4,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 0,
"remote_cluster_client" : 4,
"transform" : 3,
"voting_only" : 1
},
"versions" : [
"7.10.0"
],
"os" : {
"available_processors" : 48,
"allocated_processors" : 24,
"names" : [
{
"name" : "Windows Server 2019",
"count" : 4
}
],
"pretty_names" : [
{
"pretty_name" : "Windows Server 2019",
"count" : 4
}
],
"mem" : {
"total" : "383.4gb",
"total_in_bytes" : 411772076032,
"free" : "127.2gb",
"free_in_bytes" : 136611741696,
"used" : "256.2gb",
"used_in_bytes" : 275160334336,
"free_percent" : 33,
"used_percent" : 67
}
},
"process" : {
"cpu" : {
"percent" : 0
},
"open_file_descriptors" : {
"min" : -1,
"max" : -1,
"avg" : 0
}
},
"jvm" : {
"max_uptime" : "38.2m",
"max_uptime_in_millis" : 2297261,
"versions" : [
{
"version" : "14.0.2",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "14.0.2+12-46",
"vm_vendor" : "Oracle Corporation",
"bundled_jdk" : true,
"using_bundled_jdk" : false,
"count" : 4
}
],
"mem" : {
"heap_used" : "2.2gb",
"heap_used_in_bytes" : 2433056080,
"heap_max" : "50gb",
"heap_max_in_bytes" : 53687091200
},
"threads" : 153
},
"fs" : {
"total" : "6.5tb",
"total_in_bytes" : 7196607758336,
"free" : "6.2tb",
"free_in_bytes" : 6888031485952,
"available" : "6.2tb",
"available_in_bytes" : 6888031469568
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 4
},
"http_types" : {
"security4" : 4
}
},
"discovery_types" : {
"zen" : 4
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "zip",
"count" : 4
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Any idea how to solve this error?

Elasticsearch wildcard query with spaces

I'm trying to do a wildcard query with spaces. It easily matches the words on term basis but not on field basis.
I've read the documentation which says that I need to have the field as not_analyzed but with this type set, it returns nothing.
This is the mapping with which it works on term basis:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
This is the mapping with which the exact same query returns nothing:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string",
"index" : "not_analyzed"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
The query is here:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*test*"}}}'
Response with the not_analyzed property:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
Response without not_analyzed:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
...
EDIT: Adding requested info
Here is the list of documents:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
"_index" : "denshop",
"_type" : "products",
"_id" : "3L1",
"_score" : 1.0,
"_source" : {
"id" : 3,
"name" : "Testovací produkt 2",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-2/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "4L1",
"_score" : 1.0,
"_source" : {
"id" : 4,
"name" : "Testovací produkt 3",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-3/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "2L1",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "Testovací produkt",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "5L1",
"_score" : 1.0,
"_source" : {
"id" : 5,
"name" : "Testovací produkt 4",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-4/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "6L1",
"_score" : 1.0,
"_source" : {
"id" : 6,
"name" : "Testovací produkt 5",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/tricka-tilka-tuniky/testovaci-produkt-5/"
}
} ]
}
}
Without the not_analyzed it returns with this:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací*"}}}'
But not with this (notice the space before asterisk):
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací *"}}}'
When I add the not_analyzed to mapping, it returns no hits no matter what I put in the wildcard query.
Add a custom analyzer that should lowercase the text. Then in your search query, before passing the text to it have it lowercased in your client application.
To, also, keep the original analysis chain, I've added a sub-field to your name field that will use the custom analyzer.
PUT /denshop
{
"settings": {
"analysis": {
"analyzer": {
"keyword_lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"products": {
"properties": {
"name": {
"type": "string",
"fields": {
"lowercase": {
"type": "string",
"analyzer": "keyword_lowercase"
}
}
}
}
}
}
}
And the query will work on the sub-field:
GET /denshop/products/_search
{
"query": {
"wildcard": {
"name.lowercase": "*testovací *"
}
}
}

Using mongodb $lookup on big documents is slow

I have users_users document with 966.628 entries and orders_orders with 1.419.081 (and above 14.000.000 entries inside orders).
I need to retrieve user's data based on multiple filters (location, birthday, nb orders, nb products bought, etc.) but it never ends. I'm new with mongodb so I probably do bad things and need to learn.
db.users_users.aggregate([{
$match: {
locale: {
$in: ["fr_FR", "fr_BE"]
},
"users_addresses.country_iso2": "FR",
mobile: {
$ne: null
}
}
}, {
$lookup: {
from: "orders_orders",
localField: "_id",
foreignField: "id_user",
as: "orders"
}
}, {
$unwind: "$orders"
}, {
$group: {
"_id": "$_id",
"lastname": {
$first: "$lastname"
},
"firstname": {
$first: "$firstname"
},
"email": {
$first: "$email"
},
"date_birth": {
$first: "$date_birth"
},
"locale": {
$first: "$locale"
},
"nb_orders": {
$sum: 1
},
"order_total": {
$sum: "$orders.tax_inclusive_amount"
},
"last_order": {
$max: "$orders.date_creation"
},
"entries": {
$push: "$orders.entries"
},
"countries": {
$addToSet: "$users_addresses.id_country"
},
}
}, {
$unwind: "$entries"
}, {
$unwind: "$entries"
}, {
$group: {
"_id": "$_id",
"lastname": {
$first: "$lastname"
},
"firstname": {
$first: "$firstname"
},
"email": {
$first: "$email"
},
"date_birth": {
$first: "$date_birth"
},
"locale": {
$first: "$locale"
},
"nb_orders": {
$first: "$nb_orders"
},
"order_total": {
$first: "$order_total"
},
"last_order": {
$first: "$last_order"
},
"countries": {
$first: "$countries"
},
"nb_entries": {
$sum: 1
}
}
}, {
$match: {
nb_orders: {
$gt: 1
},
nb_entries: {
$gt: 10
}
}
}])
EDIT:
Indexes, documents and output as requested
users_users indexes
> db.users_users.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"pre_mongified_id" : 1
},
"name" : "pre_mongified_id_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"email" : 1
},
"name" : "email_1",
"ns" : "elf.users_users",
"background" : true
},
{
"v" : 1,
"key" : {
"date_birth" : 1
},
"name" : "date_birth_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"mobile" : 1
},
"name" : "mobile_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"locale" : 1
},
"name" : "locale_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"users_addresses.postal_code" : 1
},
"name" : "users_addresses.postal_code_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"users_addresses.city" : 1
},
"name" : "users_addresses.city_1",
"ns" : "elf.users_users"
},
{
"v" : 1,
"key" : {
"users_addresses.country_iso2" : 1
},
"name" : "users_addresses.country_iso2_1",
"ns" : "elf.users_users"
}
]
orders_orders indexes
> db.orders_orders.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "elf.orders_orders"
},
{
"v" : 1,
"key" : {
"pre_mongified_id" : 1
},
"name" : "pre_mongified_id_1",
"ns" : "elf.orders_orders"
},
{
"v" : 1,
"key" : {
"id_user" : 1
},
"name" : "id_user_1",
"ns" : "elf.orders_orders"
},
{
"v" : 1,
"key" : {
"entries.id_target" : 1,
"entries.type" : 1
},
"name" : "entries.id_target_1_entries.type_1",
"ns" : "elf.orders_orders",
"background" : true
},
{
"v" : 1,
"key" : {
"number" : 1
},
"name" : "number_1",
"ns" : "elf.orders_orders"
}
]
users_users sample
> db.users_users.find().limit(2).pretty()
{
"_id" : ObjectId("56c46f6eae6f960fb6f59107"),
"id_civilitytitle" : 2,
"date_creation" : ISODate("2008-09-05T18:17:42Z"),
"date_update" : null,
"firstname" : "xxx",
"lastname" : "YYY",
"email" : "xxx#xxx.fr",
"phone" : "xxxxxxxxxx",
"mobile" : null,
"fax" : "",
"disabled" : false,
"confirmed" : true,
"date_birth" : null,
"locale" : "fr_FR",
"users_addresses" : [
{
"id_country" : ObjectId("56c43401ae6f960fb6000396"),
"name" : "Adresse",
"fullname" : "YYY xxx",
"address1" : "xxx",
"address2" : null,
"city" : "xxx",
"postal_code" : "11610",
"country_iso2" : "FR"
}
]
}
{
"_id" : ObjectId("56c46f6eae6f960fb6f59108"),
"id_civilitytitle" : 2,
"date_creation" : ISODate("2008-09-06T14:38:59Z"),
"date_update" : null,
"firstname" : "aaa",
"lastname" : "zzz",
"email" : "xxx#xxx.fr",
"phone" : "xx xx xx xx xx",
"mobile" : null,
"fax" : "",
"disabled" : false,
"confirmed" : true,
"date_birth" : null,
"locale" : "fr_FR",
"users_addresses" : [
{
"id_country" : ObjectId("56c43401ae6f960fb6000396"),
"name" : "Adresse",
"fullname" : "aaa zzz",
"address1" : "xxx",
"address2" : null,
"city" : "xxx",
"postal_code" : "59180",
"country_iso2" : "FR"
}
]
}
orders_orders sample
> db.orders_orders.find().skip(5).limit(2).pretty()
{
"_id" : ObjectId("56c46ccfae6f960fb6dfe9c3"),
"id_user" : ObjectId("56c46f6eae6f960fb6f59109"),
"date_creation" : ISODate("2008-09-09T08:21:56Z"),
"number" : "c000026",
"tax_inclusive_amount" : 10,
"shipping_fees" : 5.95,
"paid" : null,
"cancelled" : "cancelled",
"locale" : null,
"from_mobile" : false,
"entries" : [
{
"_id" : ObjectId("56c4340dae6f960fb60008b5"),
"id_order" : ObjectId("56c46ccfae6f960fb6dfe9c3"),
"id_target" : 58,
"type" : "reference",
"quantity" : 1,
"reference" : "#4203",
"name" : "XXX",
"tax_inclusive_price_unit" : 1,
"tax_inclusive_price_total" : 1,
"tax_rates" : "a:1:{i:0;O:38:\"Catalog_Model_References_Container_Tax\":5:{s:7:\"\u0000*\u0000rate\";d:0.196000000000000007549516567451064474880695343017578125;s:7:\"\u0000*\u0000name\";s:6:\"19.60%\";s:7:\"\u0000*\u0000type\";s:32:\"cbf1c9560e4d3dbae5d65339aefed7b0\";s:13:\"\u0000*\u0000proportion\";d:1;s:8:\"\u0000*\u0000value\";N;}}",
"weight" : null
},
{
"_id" : ObjectId("56c4340dae6f960fb60008be"),
"id_order" : ObjectId("56c46ccfae6f960fb6dfe9c3"),
"id_target" : 247,
"type" : "reference",
"quantity" : 1,
"reference" : "#1711",
"name" : "XXX",
"tax_inclusive_price_unit" : 1,
"tax_inclusive_price_total" : 1,
"tax_rates" : "a:1:{i:0;O:38:\"Catalog_Model_References_Container_Tax\":5:{s:7:\"\u0000*\u0000rate\";d:0.196000000000000007549516567451064474880695343017578125;s:7:\"\u0000*\u0000name\";s:6:\"19.60%\";s:7:\"\u0000*\u0000type\";s:32:\"cbf1c9560e4d3dbae5d65339aefed7b0\";s:13:\"\u0000*\u0000proportion\";d:1;s:8:\"\u0000*\u0000value\";N;}}",
"weight" : null
}
]
}
{
"_id" : ObjectId("56c46ccfae6f960fb6dfe9c4"),
"id_user" : ObjectId("56c46f6eae6f960fb6f5911d"),
"date_creation" : ISODate("2008-09-09T12:32:40Z"),
"number" : "c000027",
"tax_inclusive_amount" : 15,
"shipping_fees" : 5.95,
"paid" : "paid",
"cancelled" : null,
"locale" : null,
"from_mobile" : false,
"entries" : [
{
"_id" : ObjectId("56c4340dae6f960fb60008bf"),
"id_order" : ObjectId("56c46ccfae6f960fb6dfe9c4"),
"id_target" : 105,
"type" : "reference",
"quantity" : 1,
"reference" : "#9011",
"name" : "XXX",
"tax_inclusive_price_unit" : 1,
"tax_inclusive_price_total" : 1,
"tax_rates" : "a:1:{i:0;O:38:\"Catalog_Model_References_Container_Tax\":5:{s:7:\"\u0000*\u0000rate\";d:0.196000000000000007549516567451064474880695343017578125;s:7:\"\u0000*\u0000name\";s:6:\"19.60%\";s:7:\"\u0000*\u0000type\";s:32:\"cbf1c9560e4d3dbae5d65339aefed7b0\";s:13:\"\u0000*\u0000proportion\";d:1;s:8:\"\u0000*\u0000value\";N;}}",
"weight" : null
},
{
"_id" : ObjectId("56c435b0ae6f960fb614c240"),
"id_order" : ObjectId("56c46ccfae6f960fb6dfe9c4"),
"id_target" : 364,
"type" : "reference",
"quantity" : 1,
"reference" : "#1710",
"name" : "xxx",
"tax_inclusive_price_unit" : 1,
"tax_inclusive_price_total" : 1,
"tax_rates" : "a:1:{i:0;O:38:\"Catalog_Model_References_Container_Tax\":5:{s:7:\"\u0000*\u0000rate\";d:0.196000000000000007549516567451064474880695343017578125;s:7:\"\u0000*\u0000name\";s:6:\"19.60%\";s:7:\"\u0000*\u0000type\";s:32:\"cbf1c9560e4d3dbae5d65339aefed7b0\";s:13:\"\u0000*\u0000proportion\";d:1;s:8:\"\u0000*\u0000value\";N;}}",
"weight" : null
}
]
}
expected output
Multiple data from users_users for each lines (firstname, lastname, email, birth_date, locale, ...)
The reason why the query is slow is because further queries on the documents retrieved with the $lookup operator do not use indexes.
$max: "$orders.date_creation" in particular won't be indexed so it will do a full scan to retrieve this.

Elasticsearch: sum of total term frequency in ONE document

I need sumttf of ONE document in a field. However I can get sumttf of all documents only...
I need to be able to access the variable in script like _index['field'].sumttf() of that particular document. This is what I've got so far.
Mapping:
{"document2" : {
"mappings" : {
"document2" : {
"_all" : {
"enabled" : false
},
"properties" : {
"content" : {
"type" : "string",
"term_vector" : "yes",
"fields" : {
"with_shingles" : {
"type" : "string",
"analyzer" : "my_shingle_analyzer"
}
}
},
...
Term vector:
"_index" : "document2",
"_type" : "document2",
"_id" : "709718",
"_version" : 1,
"term_vectors" : {
"content" : {
"field_statistics" : {
"sum_doc_freq" : 60676474,
"doc_count" : 198373,
"sum_ttf" : 224960172
},
terms" : {
"0" : {
"term_freq" : 8
},
"0.5" : {
"term_freq" : 1
},
"003a0e45ea07a" : {
"term_freq" : 1
},
"005" : {
"term_freq" : 1
},
"0081989" : {
"term_freq" : 1
},
"01" : {
"term_freq" : 1
},
"01.08.2002" : {
"term_freq" : 1
},
...

Resources