I'm developing a search engine for a client, and it has to support synonym expansion. I can set up my index with a synonym token filter and a custom file (synonym.txt).
Example: ipod, i-pod, i pod
Whenever synonym expansion applies, I fetch the synonyms from Elasticsearch and display them as tags on the website. Each tag can be unselected. In that case, how can I tell Elasticsearch at query time to use a different set of synonyms, one that does not come from the synonym.txt file?
Example: if the user searches for the term ipod, I show these two tags: i-pod, i pod. If the user unselects "i-pod", I would like to be able to specify at query time that only "i pod" is a synonym of "ipod".
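To illustrate the intent, the query my application would have to build from the still-selected tags could look roughly like this (a sketch only; company_name stands in for whichever fields are actually searched):
{
  "query": {
    "bool": {
      "should": [
        { "match": { "company_name": "ipod" } },
        { "match": { "company_name": "i pod" } }
      ],
      "minimum_should_match": 1
    }
  }
}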
My index settings are:
{
"settings": {
"analysis": {
"filter": {
"elision": {
"type": "elision",
"articles": ["l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu"]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
},
"synonymsFilter": {
"type" : "synonym",
"synonyms_path" : "analysis/synonym.txt"
},
"autocompleteFilter": {
"max_shingle_size": "5",
"min_shingle_size": "2",
"type": "shingle"
}
},
"analyzer": {
"default": {
"tokenizer": "letter",
"filter": ["asciifolding", "lowercase", "french_stemmer", "elision", "french_stop"]
},
"auto-complete-suggester": {
"filter": [
"lowercase",
"autocompleteFilter"
],
"char_filter": [
"html_strip"
],
"type": "custom",
"tokenizer": "standard"
},
"did-you-mean-suggester": {
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase"]
},
"synonym_analyzer" : {
"tokenizer" : "whitespace",
"filter" : ["synonymsFilter"]
},
"synonym_analyzer2": {
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase", "french_stop", "autocompleteFilter"]
}
}
}
},
"mappings": {
"companies": {
"date_detection": "false",
"properties": {
"auto_complete": {
"type": "string",
"analyzer": "auto-complete-suggester",
"term_vector" : "yes"
},
"did_you_mean": {
"type": "string",
"analyzer": "did-you-mean-suggester",
"term_vector" : "yes"
},
"synonyms": {
"type": "string",
"analyzer": "synonym_analyzer",
"term_vector" : "yes"
},
"company_name": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"siren": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"CPposteEntreprise": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"commercial_company_name": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"year_creation_company": {
"type": "long"
},
"month_creation_company": {
"type": "long"
},
"month_year_creation_company": {
"type": "date",
"format": "yyyyMM",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"city_company": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"departement_company": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"region_company": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"is_excellence": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"interlocuteurs": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" },
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"flag_entreprise_finance": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"flag_indirect": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"flag_direct": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"flag_investissement": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"montant_total_investissement": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"motant_total_finance": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"nombre_investissement": {
"type": "long",
"fields": {
"raw": { "type": "long", "index": "not_analyzed" }
}
},
"nombre_financement_accorde": {
"type": "long",
"fields": {
"raw": { "type": "long", "index": "not_analyzed" }
}
},
"caInterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"caExterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"caFiltre": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"effectif": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"textRank": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
},
"term_vector" : "yes",
"copy_to": [
"synonyms"
]
},
"masterKeywords": {
"type": "nested",
"properties": {
"keyword": {
"type":"string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
}
}
},
"dossiers":{
"type": "nested",
"date_detection": "false",
"properties": {
"dossierCommercial": {
"type": "long"
},
"sousDossierCommercial": {
"type": "long"
},
"historiqueProduitBPI": {
"type": "string"
},
"statutSousDossier": {
"type": "string"
},
"dateDecision": {
"type": "date",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" },
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"nomChargesAffaires": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"contactChargesAffaires": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"montantAide": {
"type": "double",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" },
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"contentValidation": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"contentDecision": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"contentDirectionEngagements": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"metaDomain": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"sousSecteur": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
}
},
"keywords": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"descriptionProjet": {
"type": "string",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" }
},
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
}
}
},
"investissements": {
"type": "nested",
"date_detection": "false",
"properties": {
"flag_indirect": {
"type": "string",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"nom_societe_gestion_svi":{
"type": "string"
},
"date_entree_investissement":{
"type": "date",
"fields": {
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"montant_investissement_df":{
"type": "double"
},
"description_projet_investissement":{
"type": "string",
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
}
}
},
"bilans":{
"type": "nested",
"date_detection": "false",
"properties": {
"bilanAnneeN": {
"properties": {
"effectif": {
"type": "long",
"fields": {
"raw": { "type": "long", "index": "not_analyzed" }
}
},
"capital": {
"type": "double"
},
"resultatNet": {
"type": "double"
},
"clotureDate": {
"type": "date"
},
"annee": {
"type": "long"
},
"ebeMoyen": {
"type": "double"
},
"caInterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"caExterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
}
}
},
"bilanAnneeN1": {
"properties": {
"effectif": {
"type": "long",
"fields": {
"raw": { "type": "long", "index": "not_analyzed" }
}
},
"capital": {
"type": "double"
},
"resultatNet": {
"type": "double"
},
"clotureDate": {
"type": "date"
},
"annee": {
"type": "long"
},
"ebeMoyen": {
"type": "double"
},
"caInterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"caExterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
}
}
},
"bilanAnneeN2": {
"properties": {
"effectif": {
"type": "long",
"fields": {
"raw": { "type": "long", "index": "not_analyzed" }
}
},
"capital": {
"type": "double"
},
"resultatNet": {
"type": "double"
},
"clotureDate": {
"type": "date"
},
"annee": {
"type": "long"
},
"ebeMoyen": {
"type": "double"
},
"caInterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
},
"caExterne": {
"type": "double",
"fields": {
"raw": { "type": "double", "index": "not_analyzed" }
}
}
}
}
}
},
"news": {
"type": "nested",
"date_detection": "false",
"properties": {
"date": {
"type": "date",
"fields": {
"suggester": { "type": "string", "analyzer": "did-you-mean-suggester" },
"raw": { "type": "string", "index": "not_analyzed" }
}
},
"description": {
"type": "string",
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"title": {
"type": "string"
},
"content": {
"type": "string",
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete",
"synonyms"
]
},
"url": {
"type": "string"
},
"tags": {
"type": "string",
"term_vector" : "yes",
"copy_to": [
"did_you_mean",
"auto_complete"
]
},
"links": {
"type": "string"
},
"external_source": {
"type": "string"
}
}
}
}
}
}
}
For now, I am using a master field called "synonyms". Is this a good idea?
Thanks in advance for your help.
I am having trouble forming a query that fetches all values, similar to a SQL GROUP BY.
Below is my data structure.
Product index:
{
"createdBy" : "61c1fcdd88dbad1920da8caf",
"creationTime" : "2021-12-22T11:58:53.576932Z",
"lastModifiedBy" : "61c1fcdd88dbad1920da8caf",
"lastModificationTime" : "2021-12-22T11:58:53.576932Z",
"id" : "61c312fdc6aa620a609db0b2",
"title" : "string",
"brand" : "string",
"longDesc" : "string",
"categoryId" : "string",
"imageUrls" : [
"string",
"string"
],
"keySpecs" : [
"string",
"string",
],
"facets" : [
{
"name" : "color",
"value" : "red"
},
{
"name" : "storage",
"value" : "16 GB"
},
{
"name" : "brand",
"value" : "Intex"
}
],
"categoryName" : "handsets"
}
Now, I want to fetch all the facets with their distinct values and the count for each value. Let's say
productA has color blue, productB has color red
productA has brand ABC, productB has brand XYZ
I want a result that lists all facets like:
color: blue (200 count), red (12 count)
brand: ABC (13 count), XYZ (99 count)
Also, different products have different kinds of facets: an iphone has color, memory, brand and size, but a pen has only color and brand (no memory/size).
Note: I'm using the latest version of Elasticsearch.
=================
UPDATE 1:
Below are the ES mapping details:
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": [
"example"
]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"lalashree_standard_analyzer": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
},
"html_standard_analyzer": {
"char_filter": [
"html_strip"
],
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "keyword"
},
"createdBy": {
"type": "keyword"
},
"creationTime": {
"type": "date"
},
"lastModifiedBy": {
"type": "keyword"
},
"lastModificationTime": {
"type": "date"
},
"deleted": {
"type": "boolean"
},
"deletedBy": {
"type": "keyword"
},
"deletionTime": {
"type": "date"
},
"title": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"suggest": {
"type": "completion"
}
}
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"categoryId": {
"type": "keyword"
},
"searchDetails": {
"type": "object",
"properties": {
"desc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"keywords": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"imageUrls": {
"type": "keyword",
"index": false
},
"keySpecs": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"sections": {
"type": "object",
"properties": {
"name": {
"type": "text",
"index": false
},
"shortDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"longDesc": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
},
"htmlContent": {
"type": "text",
"analyzer": "html_standard_analyzer"
}
}
},
"facets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"specificationItems": {
"type": "object",
"properties": {
"key": {
"type": "text",
"analyzer": "lalashree_standard_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"values": {
"type": "text",
"analyzer": "lalashree_standard_analyzer"
}
}
},
"categoryName": {
"type": "keyword"
},
"productFamily": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"familyVariantOptions": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"values": {
"type": "keyword"
}
}
},
"productFamilyItems": {
"type": "nested",
"properties": {
"baseProductId": {
"type": "keyword"
},
"itemVariantInfoSet": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
}
}
}
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"stores": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"logo": {
"type": "keyword",
"index": false
},
"active": {
"type": "boolean"
},
"name": {
"type": "text"
},
"quantity": {
"type": "long"
},
"rating": {
"type": "float"
},
"totalReviewsCount": {
"type": "long"
},
"price.mrp": {
"type": "float"
},
"price.sp": {
"type": "float"
},
"location.geoPoint": {
"type": "geo_point"
},
"oos": {
"type": "boolean"
}
}
}
}
}
}
This query first groups by facet names and then groups each name's values. By setting the size parameters, you can control how many facets you get and how many items appear in each facet. I think it does what you need.
Note that if you have a very large number of documents and performance matters, this query may perform poorly.
{
"size": 0,
"aggs": {
"facets": {
"nested": {
"path": "facets"
},
"aggs": {
"names": {
"terms": {
"field": "facets.name",
"size": 10
},
"aggs": {
"values": {
"terms": {
"field": "facets.value",
"size": 10
}
}
}
}
}
}
}
}
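The response then contains one bucket per facet name, each with its own value buckets. Trimmed to the relevant parts, and with counts made up to match the example above, it looks like this:
{
  "aggregations": {
    "facets": {
      "names": {
        "buckets": [
          {
            "key": "color",
            "doc_count": 212,
            "values": {
              "buckets": [
                { "key": "blue", "doc_count": 200 },
                { "key": "red", "doc_count": 12 }
              ]
            }
          },
          {
            "key": "brand",
            "doc_count": 112,
            "values": {
              "buckets": [
                { "key": "XYZ", "doc_count": 99 },
                { "key": "ABC", "doc_count": 13 }
              ]
            }
          }
        ]
      }
    }
  }
}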
I am trying to create an index with a custom default analyzer.
I already checked the following questions:
Analyzer not found exception while creating an index with mapping and settings
How to specify an analyzer while creating an index in ElasticSearch
mapper_parsing_exception for a custom analyzer while creating index in elasticsearch?
but they didn't solve the issue.
Here is my schema:
PUT /emails
{
"mappings": {
"email": {
"analyzer": "lkw",
"properties": {
"createdOn": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"data": {
"type": "object",
"dynamic": "true"
},
"from": {
"type": "string",
"store": true
},
"id": {
"type": "string",
"store": true
},
"sentOn": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"sesId": {
"type": "string",
"store": true
},
"subject": {
"type": "string",
"store": true,
"analyzer": "standard"
},
"templates": {
"properties": {
"html": {
"type": "string",
"store": true
},
"plainText": {
"type": "string",
"store": true
}
}
},
"to": {
"type": "string",
"store": true
},
"type": {
"type": "string",
"store": true
}
}
},
"event": {
"_parent": {
"type": "email"
},
"analyzer": "lkw",
"properties": {
"id": {
"type": "string",
"store": true
},
"origin": {
"type": "string",
"store": true
},
"time": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"type": {
"type": "string",
"store": true
},
"userAgent": {
"type": "string",
"store": true
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"lkw": {
"tokenizer": "keyword",
"filter": [
"lowercase"
],
"type": "custom"
}
}
}
}
}
When I execute the command above, I get this error:
{
"error": {
"root_cause": [
{
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [analyzer : lkw]"
}
],
"type": "mapper_parsing_exception",
"reason": "Failed to parse mapping [event]: Root mapping definition has unsupported parameters: [analyzer : lkw]",
"caused_by": {
"type": "mapper_parsing_exception",
"reason": "Root mapping definition has unsupported parameters: [analyzer : lkw]"
}
},
"status": 400
}
The analyzer parameter is not supported at the root of a type mapping, which is exactly what the error is telling you. Since you have only a few string fields, I suggest you simply specify your lkw analyzer on the fields that need it, just like you did with the standard one:
PUT /emails
{
"mappings": {
"email": {
"properties": {
"createdOn": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"data": {
"type": "object",
"dynamic": "true"
},
"from": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"id": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"sentOn": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"sesId": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"subject": {
"type": "string",
"store": true,
"analyzer": "standard"
},
"templates": {
"properties": {
"html": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"plainText": {
"type": "string",
"store": true,
"analyzer": "lkw"
}
}
},
"to": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"type": {
"type": "string",
"store": true,
"analyzer": "lkw"
}
}
},
"event": {
"_parent": {
"type": "email"
},
"properties": {
"id": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"origin": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"time": {
"type": "date",
"store": true,
"format": "strict_date_optional_time||epoch_millis"
},
"type": {
"type": "string",
"store": true,
"analyzer": "lkw"
},
"userAgent": {
"type": "string",
"store": true,
"analyzer": "lkw"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"lkw": {
"tokenizer": "keyword",
"filter": [
"lowercase"
],
"type": "custom"
}
}
}
}
}
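Once the index is created, you can sanity-check the lkw analyzer with the _analyze API. On the 1.x/2.x line (which the string types and _parent mapping suggest) the query-string form below works; newer versions expect a JSON body instead:
GET /emails/_analyze?analyzer=lkw&text=Some+Subject+Line
With the keyword tokenizer plus the lowercase filter, this should return a single token: "some subject line".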
I'm trying to implement an auto-suggest control powered by an ES index. The index has multiple fields in multiple languages (Arabic and English), and I want to be able to search across all of them.
The easiest way to do that is an nGram analyzer on the "_all" field, as long as some care is taken in the mapping definition. The issue now is how to accomplish this with multiple languages.
PS: We would like a separate field for each of the possible languages (using one index).
I tried the nGram tokenizer and filter, and it works well for a single language (English):
{
"template": "index_com",
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1,
"analysis": {
"filter": {
"edgeNGram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 20
}
},
"analyzer": {
"edgeNGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"edgeNGram_filter"
]
}
}
}
},
"mappings": {
"product": {
"_all": {
"enabled": true,
"index_analyzer": "edgeNGram_analyzer",
"search_analyzer": "standard"
},
"properties": {
"id": {
"type": "string",
"index": "no",
"include_in_all": false
},
"uuid": {
"type": "string",
"index": "no",
"include_in_all": false
},
"name": {
"type": "string",
"include_in_all": true
},
"description": {
"type": "string",
"include_in_all": true
},
"brand": {
"type": "string",
"include_in_all": true
},
"made_id": {
"type": "string",
"include_in_all": true
},
"category": {
"type": "string",
"include_in_all": true
},
"category_id": {
"type": "integer",
"include_in_all": false
},
"keywords": {
"type": "string",
"include_in_all": true
},
"colors": {
"type": "string",
"index": "not_analyzed"
},
"colors_name": {
"type": "string",
"include_in_all": true
},
"quality": {
"type": "string",
"index": "not_analyzed"
},
"vendor_name": {
"type": "string",
"include_in_all": false
},
"vendor_location" : {
"type" : "geo_point",
"include_in_all": false
},
"price": {
"type": "double",
"include_in_all": false
},
"price_before_discount": {
"type": "double",
"include_in_all": false
},
"is_deal": {
"type": "integer",
"include_in_all": false
},
"is_best_seller": {
"type": "integer",
"include_in_all": false
},
"views": {
"type": "integer",
"include_in_all": false
},
"rating": {
"type": "integer",
"include_in_all": false
},
"updated_at": {
"type": "date",
"format": "dateOptionalTime"
},
"created_at": {
"type": "date",
"format": "dateOptionalTime"
},
"image_link": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
Arabic analyzer:
{
"settings": {
"analysis": {
"filter": {
"arabic_stop": {
"type": "stop",
"stopwords": "_arabic_"
},
"arabic_keywords": {
"type": "keyword_marker",
"keywords": []
},
"arabic_stemmer": {
"type": "stemmer",
"language": "arabic"
}
},
"analyzer": {
"arabic": {
"tokenizer": "standard",
"filter": [
"lowercase",
"arabic_stop",
"arabic_normalization",
"arabic_keywords",
"arabic_stemmer"
]
}
}
}
}
}
Can someone suggest a solution? Thanks!
Your second snippet defines the arabic analyzer, which is already built in, so you shouldn't need to add it.
What you are missing is telling Elasticsearch to also use the arabic analyzer. In other words, you want to analyze each field twice, in English and in Arabic. To do that, add
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
to all your fields that have "include_in_all": true. That makes your mappings look like this:
{
"template": "index_com",
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1,
"analysis": {
"filter": {
"edgeNGram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 20
}
},
"analyzer": {
"edgeNGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"edgeNGram_filter"
]
}
}
}
},
"mappings": {
"product": {
"_all": {
"enabled": true,
"index_analyzer": "edgeNGram_analyzer",
"search_analyzer": "standard"
},
"properties": {
"id": {
"type": "string",
"index": "no",
"include_in_all": false
},
"uuid": {
"type": "string",
"index": "no",
"include_in_all": false
},
"name": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"description": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"brand": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"made_id": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"category": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"category_id": {
"type": "integer",
"include_in_all": false
},
"keywords": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"colors": {
"type": "string",
"index": "not_analyzed"
},
"colors_name": {
"type": "string",
"include_in_all": true,
"fields": {
"ar": {
"type": "string",
"analyzer": "arabic"
},
"en": {
"type": "string",
"analyzer": "english"
}
}
},
"quality": {
"type": "string",
"index": "not_analyzed"
},
"vendor_name": {
"type": "string",
"include_in_all": false
},
"vendor_location": {
"type": "geo_point",
"include_in_all": false
},
"price": {
"type": "double",
"include_in_all": false
},
"price_before_discount": {
"type": "double",
"include_in_all": false
},
"is_deal": {
"type": "integer",
"include_in_all": false
},
"is_best_seller": {
"type": "integer",
"include_in_all": false
},
"views": {
"type": "integer",
"include_in_all": false
},
"rating": {
"type": "integer",
"include_in_all": false
},
"updated_at": {
"type": "date",
"format": "dateOptionalTime"
},
"created_at": {
"type": "date",
"format": "dateOptionalTime"
},
"image_link": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
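At query time you can then target whichever language variants you need. A minimal sketch (the field list and the Arabic sample term are only illustrative):
{
  "query": {
    "multi_match": {
      "query": "هاتف",
      "fields": ["name", "name.ar", "name.en", "description", "description.ar", "description.en"]
    }
  }
}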
I'm new to Elasticsearch and I have a problem.
I have 1 million rows of data and the query takes too long to return results.
With 150k documents it took 0.5 s; now it takes 10 s.
The amount of data differs from day to day (one day it can be 150k, another day 1 million, and so on).
I need advice on how to make it faster.
Mapping
{
"mappings": {
"Jobs": {
"_ttl": {
"enabled": true,
"default": "1d"
},
"properties": {
"id": {
"type": "integer"
},
"advertiser_id": {
"type": "integer"
},
"company_id": {
"type": "integer"
},
"feed_id": {
"type": "integer"
},
"description_unique": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"title": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"city": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"county": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"country": {
"type": "integer"
},
"description": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"store": true
}
}
},
"company": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"url": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"premium": {
"type": "integer"
},
"bid": {
"type": "integer"
},
"created": {
"type": "date",
"format": "dateOptionalTime",
"default": "basic_date"
},
"updated": {
"type": "date",
"format": "dateOptionalTime"
}
}
}
}
}
Query
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "Survey Developer",
"type": "best_fields",
"fields": [
"title",
"description"
],
"operator": "and"
}
}
]
}
},
"highlight": {
"boundary_chars": ".,!? \t\n",
"tag_schema": "styled",
"pre_tags": [
"<b>"
],
"post_tags": [
"</b>"
],
"fields": {
"description": {
"fragment_size": 200,
"number_of_fragments": 3
}
}
},
"sort": [
{
"premium": {
"order": "desc"
}
},
{
"bid": {
"order": "desc"
}
}
]
}
Server parameters:
CPU 1 vCPU
RAM 1 GB
System Disk 40 GB
Network 120 Mb/s
I have the following mapping on my index in elasticsearch.
{
"mail": {
"properties": {
"project": {
"type": "string",
"index": "not_analyzed",
"null_value": "na",
"include_in_all": "false"
},
"mailbox": {
"type": "string",
"index": "not_analyzed",
"null_value": "#na",
"store" : "yes"
},
"path": {
"type": "string",
"index": "not_analyzed",
"null_value": "#na",
"store" : "yes"
},
"messageid": {
"type": "string",
"index": "not_analyzed",
"null_value": "na",
"include_in_all": "false"
},
"nodeid":
{
"type": "string",
"index": "not_analyzed",
"null_value": "na",
"include_in_all": "false",
"store" : "yes"
},
"replyto": {
"type": "string",
"index": "not_analyzed",
"null_value": "na",
"include_in_all": "false"
},
"references": {
"type": "string",
"index": "not_analyzed",
"null_value": "na",
"include_in_all": "false"
},
"subject": {
"boost": "3.0",
"type": "string",
"term_vector": "with_positions_offsets",
"analyzer": "snowball"
},
"from": {
"type": "nested",
"properties": {
"name": {
"type" : "multi_field",
"fields" : {
"name" : {"type" : "string", "analyzer" : "standard", "index" : "analyzed"},
"untouched" : {"type" : "string", "index" : "not_analyzed"}
}
},
"address": {
"type": "string",
"analyzer": "analyzer_email"
},
"nodeid": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
},
"facet": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
}
}
},
"to": {
"type": "nested",
"properties":{
"name": {
"type": "string",
"analyzer":"analyzer_keyword"
},
"address": {
"type": "string",
"analyzer": "analyzer_email"
},
"nodeid": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
},
"facet": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
}
}
},
"cc": {
"type": "nested",
"properties":{
"name": {
"type": "string",
"analyzer":"analyzer_keyword"
},
"address": {
"type": "string",
"analyzer": "analyzer_email"
},
"nodeid": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
},
"facet": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
}
}
},
"bcc": {
"type": "nested",
"properties":{
"name": {
"type": "string",
"analyzer":"analyzer_keyword"
},
"address": {
"type": "string",
"analyzer": "analyzer_email"
},
"nodeid": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
},
"facet": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
}
}
},
"message_snippet": {
"type": "string",
"index": "no",
"include_in_all": "false"
},
"text_messages": {
"type": "string",
"store": "yes",
"term_vector": "with_positions_offsets",
"analyzer": "snowball"
},
"html_messages": {
"type": "string",
"store": "yes",
"term_vector": "with_positions_offsets",
"analyzer": "snowball"
},
"message_attachments": {
"dynamic": "true",
"properties":{
"filename":{
"type": "string",
"store": "yes"
},
"content":{
"type": "string",
"store": "yes",
"term_vector": "with_positions_offsets",
"analyzer": "snowball"
},
"hash":{
"type": "string",
"store": "yes",
"analyzer": "analyzer_keyword"
},
"nodeid":{
"type": "string"
}
}
},
"date": {
"type": "date"
},
"entities": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"analyzer": "analyzer_keyword"
},
"type": {
"type": "string",
"analyzer": "analyzer_keyword"
},
"nodeid":{
"type": "string"
},
"facet": {
"type": "string",
"index": "not_analyzed",
"store" : "yes"
}
}
}
}
}
}
I try searching on the mail.from.name field with the following query, but it doesn't give me any results.
{
"query": {
"nested": {
"path": "from",
"query": {
"term": {
"name": "mark"
}
}
}
}
}
What is wrong with my mapping or query?
A sample document looks like this:
{
"project": "test",
"mailbox": "test.pst",
"messageid": "5e667f7f-4421-4836-91f3-8b5216c04839",
"nodeid": "671",
"subject": "No Subject",
"from": [
{
"name": "Mike Johnson",
"address": "mike#gmail.com",
"nodeid": "3",
"facet": "Mike Johnson"
}
],
"to": [
{
"name": "John Doe",
"address": "JDoe#gmail.com",
"nodeid": "367",
"facet": "John Doe"
}
],
"cc": [],
"bcc": [],
"textbody": "this is a test email with no further lines of text",
"htmlbody": "",
"snippet": "",
"transmitted": "",
"replyto": "",
"references": "",
"attachments": [],
"entities": [
{
"name": "google",
"type": "organization",
"nodeid": "656",
"facet": "google"
}
],
"domains": [
"google.com"
],
"path": ""
}
You need to address the nested object's field by its full path (from.name) in your query:
{
"query": {
"nested": {
"path": "from",
"query": {
"term": {
"from.name": "mike"
}
}
}
}
}
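Also note that from.name is analyzed with the standard analyzer, so the indexed tokens are lowercased ("mike", "johnson"). A term query only matches if you pass the already-lowercased token; if you want your input analyzed for you, use a match query instead, for example:
{
  "query": {
    "nested": {
      "path": "from",
      "query": {
        "match": {
          "from.name": "Mike"
        }
      }
    }
  }
}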