How to index files with Elasticsearch, Ingest-Mapper and NEST - elasticsearch

I am currently trying to index some files (most importantly PDFs) with Elasticsearch and the NEST client from ASP.NET Core 1.0.
I found some code snippets and tried to use it for my purpose.
Those are the code segments that I use to set up the Elasticsearch index and the Pipeline:
/// <summary>
/// Builds the NEST connection settings (mapping FSDocumentFile to the configured
/// index and logging every completed call), then recreates the index and
/// registers the ingest pipeline.
/// </summary>
private void SetupElasticSearch()
{
    var elasticSettings = _appSettings.ElasticSearchSettings;
    var nodeUri = new Uri(elasticSettings.Url);

    var settings = new ConnectionSettings(nodeUri)
        .MapDefaultTypeIndices(indices => indices
            .Add(typeof(FSDocumentFile), elasticSettings.IndexName))
        .OnRequestCompleted(call =>
        {
            // Request line first, followed by the request payload when one was sent.
            _logger.LogInformation(string.Format("{0} {1}", call.HttpMethod, call.Uri));
            var requestBody = call.RequestBodyInBytes;
            if (requestBody != null)
            {
                _logger.LogInformation(Encoding.UTF8.GetString(requestBody));
            }

            // Then the status code, followed by the response payload when one came back.
            _logger.LogInformation(string.Format("{0}", call.HttpStatusCode));
            var responseBody = call.ResponseBodyInBytes;
            if (responseBody != null)
            {
                _logger.LogInformation(Encoding.UTF8.GetString(responseBody));
            }
        });

    var client = new ElasticClient(settings);
    CreateElasticSearchIndex(client);
    CreatePipeline(client);
}
/// <summary>
/// Deletes the configured index and creates it again with a custom
/// path-hierarchy analyzer and an explicit mapping for <c>FSDocumentFile</c>.
/// </summary>
/// <param name="Client">NEST client used to issue the delete and create index requests.</param>
private void CreateElasticSearchIndex(ElasticClient Client)
{
// The delete is unconditional: the index is dropped on every call before being recreated.
Client.DeleteIndex(_appSettings.ElasticSearchSettings.IndexName);
Client.CreateIndex(_appSettings.ElasticSearchSettings.IndexName, c => c
.Settings(s => s
.Analysis(a => a
// Custom analyzer built on the path-hierarchy tokenizer declared below.
// Its only use in the mapping (on Comment) is currently commented out.
.Analyzers(ad => ad
.Custom("windows_path_hierarchy_analyzer", ca => ca
.Tokenizer("windows_path_hierarchy_tokenizer")
)
)
.Tokenizers(t => t
.PathHierarchy("windows_path_hierarchy_tokenizer", ph => ph
// Backslash as the path delimiter (Windows-style paths).
.Delimiter('\\')
)
)
)
)
.Mappings(m => m
.Map<FSDocumentFile>(mp => mp
// The _all field is switched off for this type.
.AllField(all => all
.Enabled(false)
)
.Properties(ps => ps
.Number(n => n
.Name(nn => nn.Id)
)
.Text(s => s
.Name(n => n.Comment)
//.Analyzer("windows_path_hierarchy_analyzer")
)
//.Text(s => s
// .Name(n => n.Content)
//)
// FileData is mapped as an object whose sub-fields mirror the Attachment properties.
.Object<Attachment>(a => a
.Name(n => n.FileData)
.Properties(p => p
.Text(t => t
.Name(n => n.Name)
)
.Text(t => t
.Name(n => n.Content)
)
.Text(t => t
.Name(n => n.ContentType)
)
.Number(n => n
.Name(nn => nn.ContentLength)
)
.Date(d => d
.Name(n => n.Date)
)
.Text(t => t
.Name(n => n.Author)
)
.Text(t => t
.Name(n => n.Title)
)
.Text(t => t
.Name(n => n.Keywords)
)
)
)
)
)
)
);
}
/// <summary>
/// Registers an ingest pipeline named "attachments" with two processors:
/// an attachment processor that reads <c>FileData.Content</c> and writes its
/// output to <c>Content</c>, followed by a remove processor that drops
/// <c>FileData</c> from the document.
/// </summary>
/// <param name="Client">NEST client used to register the pipeline.</param>
private void CreatePipeline(ElasticClient Client)
{
Client.PutPipeline("attachments", p => p
.Description("Document attachment pipeline")
.Processors(pr => pr
// NOTE(review): the Elasticsearch error quoted below reports that [content] is not
// present under [fileData]; presumably the serialized Attachment does not expose a
// "content" key at that path. Confirm against the request body logged by
// OnRequestCompleted before relying on this source field.
.Attachment<FSDocumentFile>(a => a
.Field(f => f.FileData.Content)
.TargetField(f => f.Content)
)
.Remove<FSDocumentFile>(r => r
.Field(f => f.FileData)
)
)
);
}
This is the definition of the FSDocumentFile class that is used for indexing:
/// <summary>
/// Document type indexed into Elasticsearch under the type name "FSDocumentFile".
/// </summary>
[ElasticsearchType(Name = "FSDocumentFile")]
public class FSDocumentFile
{
/// <summary>
/// Numeric identifier of the document file.
/// </summary>
public int Id { get; set; }
/// <summary>
/// FileData Base64 encoded
/// </summary>
public string Content { get; set; }
/// <summary>
/// The file itself, carried as a NEST <c>Attachment</c>.
/// </summary>
[Attachment(Store = true)]
public Attachment FileData { get; set; }
/// <summary>
/// Free-text comment stored alongside the file.
/// </summary>
public string Comment { get; set; }
}
This is the code that I use to index the file:
// Build the document to index: the raw file bytes are Base64-encoded into
// FileData.Content and described as a PDF.
FSDocumentFile fsFile = new FSDocumentFile()
{
Id = df.DocumentFileID,
FileData = new Attachment()
{
Content = Convert.ToBase64String(fd.FileBytes),
ContentType = "application/pdf",
ContentLength = fd.FileBytes.Count(),
Name = fileName
},
Comment = "TEst Comment" + df.DocumentFileID.ToString()
};
ElasticClient client = new ElasticClient(settings);
// Index the document through the "attachments" ingest pipeline and keep the
// Result value exposed by the index response.
Result callResult = client.Index<FSDocumentFile>(fsFile, fi => fi.Pipeline("attachments")).Result;
It always results in an error on Elasticsearch that says:
[2016-11-28T15:40:28,311][ERROR][o.e.a.i.IngestActionFilter] [mU3hlQ7] failed to execute pipeline [attachments]
org.elasticsearch.ElasticsearchException: java.lang.IllegalArgumentException: ElasticsearchParseException[Error parsing document in field [fileData.content]]; nested: IllegalArgumentException[field [content] not present as part of path [fileData.content]];
at org.elasticsearch.ingest.CompoundProcessor.newCompoundProcessorException(CompoundProcessor.java:156) ~[elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.ingest.CompoundProcessor.execute(CompoundProcessor.java:107) ~[elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.ingest.Pipeline.execute(Pipeline.java:58) ~[elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.ingest.PipelineExecutionService.innerExecute(PipelineExecutionService.java:166) ~[elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.ingest.PipelineExecutionService.access$000(PipelineExecutionService.java:41) ~[elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.ingest.PipelineExecutionService$1.doRun(PipelineExecutionService.java:65) [elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:504) [elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-5.0.0-rc1.jar:5.0.0-rc1]
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) [?:1.8.0_112]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) [?:1.8.0_112]
at java.lang.Thread.run(Unknown Source) [?:1.8.0_112]
Caused by: java.lang.IllegalArgumentException: ElasticsearchParseException[Error parsing document in field [fileData.content]]; nested: IllegalArgumentException[field [content] not present as part of
path [fileData.content]];
Can someone please assist me? Or maybe point me to a good tutorial how to use Elasticsearch, Ingest-Mapper and NEST client together?
Thanks a lot in advance.

Related

Is it possible to have multiple queries match multiple fields?

I want to get all results which match the query "table" in the Title field and match the number "1" in the CategoryId field of my Product class. Can this be done in elasticsearch/nest?
/// <summary>
/// Product document as searched in the queries below.
/// </summary>
public class ProductModel
{
/// <summary>Product title; the text field targeted by the search query.</summary>
public string Title { get; set; }
/// <summary>Identifier of the category the product belongs to.</summary>
public int CategoryId { get; set; }
}
This is what I have now:
// Paged search: a simple query string over Title (boost 3.50).
// No condition on CategoryId is applied here.
response = await ElasticClient.SearchAsync<ProductModel>(s => s
.From(skip)
.Size(itemsPerPage)
.Index(indexName)
.Query(q => q
.SimpleQueryString(qs => qs
.Fields(fs => fs
.Field(f => f.Title, 3.50)
)
.Query("" + productSearch.Query + "")
)
)
);
I want to get only the results which also have the value "1" in the CategoryId field.
Thanks to @Benjamin Trent
In case someone else should have a similar issue, I ended up using the following:
// Paged search that requires BOTH conditions: the document's CategoryId must
// equal the requested category AND the simple query string must match one of
// the boosted text fields. The clauses are therefore `must` clauses; as
// `should` clauses a document matching only one of them would still be returned.
response = await ElasticClient.SearchAsync<ProductModel>(s => s
    .From(skip)
    .Size(itemsPerPage)
    .Index(indexName)
    .Query(q => q
        .Bool(b => b
            .Must(
                // Exact match on the category.
                bs => bs.Term(p => p.CategoryId, productSearch.CategoryId),
                // Free-text match; the second argument of Field is the per-field boost.
                bs => bs.SimpleQueryString(qs => qs
                    .Fields(fs => fs
                        .Field(f => f.Title, 3.50)
                        .Field(f => f.BrandName, 3.00)
                        .Field(f => f.Description, 2.00)
                    )
                    .Query("" + productSearch.Query + "")
                )
            )
        )
    )
);

Not understanding the behavior of _id on ES

I have documents in a Mongo database, they use the _id field as an index.
I use Monstache to sync ES with Mongo's op log, so the documents in ES have the same _id field.
When searching a specific document, Kibana shows:
Tags:
tag1 testtag CreatedOn:
October 26th 2018, 14:25:57.053
_id:
FRaqDPIzWcVI2dl-oA9uUFHLVFQk8qIqqhySWSkM7Ds
_type:
testobject
_index:
test.object
_score:
0
but then a query with Nest, returns this in the Documents array:
_id = 0d5aa177-3066-4c6a-aaf5-9b887ae7297f
and when I look in the Hits array, I see:
Id = FRaqDPIzWcVI2dl-oA9uUFHLVFQk8qIqqhySWSkM7Ds
So in Documents, the _id is now an unrelated guid, but in Hits the _id is called Id and it has the right value.
Why is that, and is there a way to get the proper value for _id in Documents?
Edit: more info
This is the object; since it's shared by MongoDB and ES, it has attributes for both.
/// <summary>
/// Document shared between MongoDB and Elasticsearch; it carries mapping
/// attributes for both (Nest.* for Elasticsearch, Bson* for MongoDB).
/// </summary>
[Nest.ElasticsearchType, BsonIgnoreExtraElements]
public class TestObject
{
public string _id { get; set; }
public string OwnerId { get; set; }
public Flags Flags { get; set; }
// Title and Tags are mapped as text fields on the Elasticsearch side.
[Nest.Text, BsonIgnoreIfDefault] public string Title { get; set; }
[Nest.Text] public string Tags { get; set; }
// Hash and Link carry Nest.Ignore, i.e. they are excluded on the Elasticsearch side.
[Nest.Ignore] public string Hash { get; set; }
[Nest.Ignore, BsonIgnoreIfDefault] public string Link { get; set; }
}
This is the code creating the index:
/// <summary>
/// Creates the shared Elasticsearch client and, when the index does not exist
/// yet, creates it with a custom "index" analyzer applied to Title and Tags.
/// </summary>
/// <param name="ConnectionString">URI of the Elasticsearch node.</param>
// NOTE(review): the snippet as shown ends before the closing braces of the
// `if` block and of the method.
private static void InitializeElasticSearch(string ConnectionString)
{
var Settings = new ConnectionSettings(new Uri(ConnectionString))
.DefaultIndex(_IndexName)
// Identity inferrer: field names are taken from the property names unchanged.
.DefaultFieldNameInferrer(_ => _)
// The _id property of TestObject is marked as ignored in the client-side mapping.
.DefaultMappingFor<TestObject>(_ => _.Ignore(I => I._id));
_ElasticClient = new ElasticClient(Settings);
if (!_ElasticClient.IndexExists(_IndexName).Exists)
{
// create the index
var CreateIndexResponse = _ElasticClient.CreateIndex(_IndexName, C => C
.Settings(S => S
.Analysis(A => A
// Mapping char filter fed from TextLists.Expressions.
.CharFilters(Cf => Cf
.Mapping("expressions", E => E
.Mappings(TextLists.Expressions)
)
)
// Synonym token filter fed from TextLists.Synonyms.
.TokenFilters(Tf => Tf
.Synonym("synonyms", Sy => Sy
.Synonyms(TextLists.Synonyms)
.Tokenizer("whitespace")
)
)
// The "index" analyzer chains the char filter, the standard tokenizer and three token filters.
.Analyzers(An => An
.Custom("index", Ca => Ca
.CharFilters("expressions")
.Tokenizer("standard")
.Filters("standard", "synonyms", "stop")
)
)
)
)
.Mappings(M => M
.Map<TestObject>(Mm => Mm
.AutoMap()
// Explicit overrides on top of AutoMap: Title and Tags use the "index" analyzer.
.Properties(P => P
.Text(T => T
.Name(N => N.Title)
.Analyzer("index")
)
.Text(T => T
.Name(N => N.Tags)
.Analyzer("index")
)
)
)
)
);
Then, the query code:
// Searches TestObject documents: a match-all query when Terms is "*", otherwise
// a fuzzy multi-match over Tags and Title. At most _MaxObjectReturned hits are requested.
var R = await _ElasticClient.SearchAsync<TestObject>(Sr => Sr
.Query(Q =>
{
// do we query 'all' ?
if (Terms == "*") return Q.MatchAll();
// or do we have a general query
return Q
.MultiMatch(Fu => Fu
.Fields(F => F
.Field(Ff => Ff.Tags)
.Field(Ff => Ff.Title)
)
.Query(Terms)
// Fuzziness is set to an edit distance of 2.
.Fuzziness(Fuzziness.EditDistance(2))
);
})
.Take(_MaxObjectReturned)
);

Does Elasticsearch Nest support Update By Query

I want to use the UpdateByQuery method on the high level client but can't find any documentation for Nest. They have great documentation if I wanted to make a CURL request but nothing for NEST. https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-update-by-query.html
If anyone has an example of using it, or can share documentation they have found, that would be awesome!
Update By Query API is supported in NEST. Here's an example adapted from the integration tests. NEST Documentation for Index and Update APIs is planned :)
/// <summary>
/// End-to-end demo of the Update By Query API: recreates the "tests" index,
/// bulk-indexes two documents, counts the documents whose flag matches "foo",
/// rewrites the flag of the "bar" document via update-by-query, and counts again.
/// </summary>
private static void Main()
{
    var nodeUri = new Uri("http://localhost:9200");
    var connectionPool = new SingleNodeConnectionPool(nodeUri);
    var connectionSettings = new ConnectionSettings(connectionPool)
        .DefaultMappingFor<Test>(mapping => mapping
            .IndexName("tests")
            .TypeName("test"));
    var client = new ElasticClient(connectionSettings);
    var testIndex = IndexName.From<Test>();

    // Start from a clean index on every run.
    var existsResponse = client.IndexExists(testIndex);
    if (existsResponse.Exists)
    {
        client.DeleteIndex(testIndex);
    }

    // Text is mapped as a text field, Flag as a keyword field.
    client.CreateIndex(testIndex, create => create
        .Mappings(mappings => mappings
            .Map<Test>(typeMap => typeMap
                .Properties(fields => fields
                    .Text(text => text.Name(doc => doc.Text))
                    .Keyword(keyword => keyword.Name(doc => doc.Flag))))));

    // Seed two documents that differ only in their flag.
    var seedDocuments = new[]
    {
        new Test { Text = "words words", Flag = "bar" },
        new Test { Text = "words words", Flag = "foo" }
    };
    client.Bulk(bulk => bulk
        .IndexMany(seedDocuments)
        .Refresh(Refresh.WaitFor));

    // Count of "foo" documents before the update.
    client.Count<Test>(count => count
        .Query(query => query
            .Match(match => match
                .Field(doc => doc.Flag)
                .Query("foo"))));

    // Set the flag to 'foo' on every document whose flag is currently "bar".
    client.UpdateByQuery<Test>(update => update
        .Query(query => query
            .Term(doc => doc.Flag, "bar"))
        .Script("ctx._source.flag = 'foo'")
        .Conflicts(Conflicts.Proceed)
        .Refresh(true));

    // Count of "foo" documents after the update.
    client.Count<Test>(count => count
        .Query(query => query
            .Match(match => match
                .Field(doc => doc.Flag)
                .Query("foo"))));
}
/// <summary>
/// Minimal document type used by the Update By Query example.
/// </summary>
public class Test
{
/// <summary>Free text; mapped as a text field in the example index.</summary>
public string Text { get; set; }
/// <summary>Flag value; mapped as a keyword field and rewritten by the update-by-query call.</summary>
public string Flag { get; set; }
}
Observe that the count from the first Count API call is 1, and on the second Count API call after the Update By Query API call, it's 2.

ElasticSearch Nest 2.x Indexing and Searching Nested Objects

I'm having trouble figuring out how to index and search nested object.
I want to be able to search nested objects and return the parents - only the parents, without the list of Remarks, but I would like highlights from the remarks returned if possible.
My models:
/// <summary>
/// Customer search document (type name "CustomerSearchResult"), identified by
/// CustomerId and carrying its remarks as a nested collection.
/// </summary>
[DataContract]
[ElasticsearchType(IdProperty = "CustomerId", Name = "CustomerSearchResult")]
public class SearchResult
{
// NOTE(review): a String mapping attribute is applied to an int property; confirm this is intended.
[DataMember]
[String(Index = FieldIndexOption.NotAnalyzed)]
public int CustomerId { get; set; }
...
// Remarks are mapped as a nested field.
[Nested]
[DataMember]
public List<RemarkForSearch> Remarks { get; set; }
}
/// <summary>
/// A single remark attached to a customer (type name "RemarkForSearch"),
/// identified by RemarkId.
/// </summary>
[ElasticsearchType(IdProperty = "RemarkId", Name = "RemarkForSearch")]
public class RemarkForSearch
{
[DataMember]
public int RemarkId { get; set; }
// Customer this remark belongs to; used to attach remarks to their SearchResult when loading.
[DataMember]
public int CustomerId { get; set; }
// Remark text; the field the nested search is meant to match against.
[DataMember]
public string RemarkText { get; set; }
}
Index creation:
// Index definition for the customer search index: the analysis chain (one char
// filter, one n-gram token filter, three custom analyzers, two tokenizers) and
// the mapping for SearchResult, including its nested Remarks collection.
var customerSearchIdxDesc = new CreateIndexDescriptor(Constants.ElasticSearch.CustomerSearchIndexName)
    .Settings(f => f
        .Analysis(analysis => analysis
            .CharFilters(cf => cf
                .PatternReplace(Constants.ElasticSearch.FilterNames.RemoveNonAlphaNumeric, pr => pr
                    // Verbatim string (@"...") so the regex backslash is not read as a C# escape sequence.
                    .Pattern(@"[^a-zA-Z\d]") // match all non alpha numeric
                    .Replacement(string.Empty)
                )
            )
            .TokenFilters(tf => tf
                .NGram(Constants.ElasticSearch.FilterNames.NGramFilter, fs => fs
                    .MinGram(1)
                    .MaxGram(20)
                )
            )
            .Analyzers(analyzers => analyzers
                // Whitespace tokenizer + lowercase/asciifolding + n-gram token filter.
                .Custom(Constants.ElasticSearch.AnalyzerNames.NGramAnalyzer, a => a
                    .Filters("lowercase", "asciifolding", Constants.ElasticSearch.FilterNames.NGramFilter)
                    .Tokenizer(Constants.ElasticSearch.TokenizerNames.WhitespaceTokenizer)
                )
                // Whitespace tokenizer + lowercase/asciifolding only.
                .Custom(Constants.ElasticSearch.AnalyzerNames.WhitespaceAnalyzer, a => a
                    .Filters("lowercase", "asciifolding")
                    .Tokenizer(Constants.ElasticSearch.TokenizerNames.WhitespaceTokenizer)
                )
                // N-gram tokenizer + lowercase/asciifolding; used by the combined search field.
                .Custom(Constants.ElasticSearch.AnalyzerNames.FuzzyAnalyzer, a => a
                    .Filters("lowercase", "asciifolding")
                    //.CharFilters(Constants.ElasticSearch.FilterNames.RemoveNonAlphaNumeric)
                    .Tokenizer(Constants.ElasticSearch.TokenizerNames.NGramTokenizer)
                )
            )
            .Tokenizers(tokenizers => tokenizers
                .NGram(Constants.ElasticSearch.TokenizerNames.NGramTokenizer, t => t
                    .MinGram(1)
                    .MaxGram(20)
                    //.TokenChars(TokenChar.Letter, TokenChar.Digit)
                )
                .Whitespace(Constants.ElasticSearch.TokenizerNames.WhitespaceTokenizer)
            )
        )
    )
    .Mappings(ms => ms
        .Map<ServiceModel.DtoTypes.Customer.SearchResult>(m => m
            .AutoMap()
            .AllField(s => s
                .Analyzer(Constants.ElasticSearch.AnalyzerNames.NGramAnalyzer)
                .SearchAnalyzer(Constants.ElasticSearch.AnalyzerNames.WhitespaceAnalyzer)
            )
            .Properties(p => p
                // Each of the following fields is stored not-analyzed and copied into the combined search field.
                .String(n => n
                    .Name(c => c.ContactName)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .String(n => n
                    .Name(c => c.CustomerName)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .String(n => n
                    .Name(c => c.City)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .String(n => n
                    .Name(c => c.StateAbbreviation)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .String(n => n
                    .Name(c => c.PostalCode)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .String(n => n
                    .Name(c => c.Country)
                    .Index(FieldIndexOption.NotAnalyzed)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                .Number(n => n
                    .Name(c => c.AverageMonthlySales)
                    .Type(NumberType.Double)
                    .CopyTo(fs => fs.Field(Constants.ElasticSearch.CombinedSearchFieldName))
                )
                // Target of the CopyTo calls above; analyzed with the fuzzy (n-gram) analyzer at index and search time.
                .String(n => n
                    .Name(Constants.ElasticSearch.CombinedSearchFieldName)
                    .Index(FieldIndexOption.Analyzed)
                    .Analyzer(Constants.ElasticSearch.AnalyzerNames.FuzzyAnalyzer)
                    .SearchAnalyzer(Constants.ElasticSearch.AnalyzerNames.FuzzyAnalyzer)
                )
                // Remarks are mapped as a nested field, with their properties auto-mapped.
                .Nested<ServiceModel.DtoTypes.Customer.RemarkForSearch>(s => s
                    .Name(n => n.Remarks)
                    .AutoMap()
                )
            )
        )
    );
var response = client.CreateIndex(customerSearchIdxDesc);
Loading the index:
// Load the customers and all searchable remarks from the database.
var searchResults = Db.SqlList<DtoTypes.Customer.SearchResult>("EXEC [Customer].[RetrieveAllForSearch]");
var remarkResults = Db.SqlList<DtoTypes.Customer.RemarkForSearch>("EXEC [Customer].[RetrieveAllSearchableRemarks]");
// Group the remarks by customer once, so each customer picks up its remarks with
// a single lookup instead of re-scanning the whole remark list. A customer
// without remarks gets an empty list, exactly as with a per-customer filter.
var remarksByCustomer = remarkResults.ToLookup(m => m.CustomerId);
foreach (var i in searchResults)
{
    i.Remarks = remarksByCustomer[i.CustomerId].ToList();
}
var settings = new ConnectionSettings(Constants.ElasticSearch.Node);
var client = new ElasticClient(settings);
// Flush the index
var flushResponse = client.Flush(Constants.ElasticSearch.CustomerSearchIndexName);
// Bulk-index the customers (each carrying its Remarks) into the customer search index
var indexResponse = client.IndexMany(searchResults, Constants.ElasticSearch.CustomerSearchIndexName);
Querying the Index:
// Nested query over the customers' remarks: returns the customers having at
// least one remark whose text matches `query`.
var searchDescriptor = new SearchDescriptor<DtoTypes.Customer.SearchResult>()
    .From(0)
    .Take(Constants.ElasticSearch.MaxResults)
    .Query(q => q
        .Nested(c => c
            .Path(p => p.Remarks)
            .Query(nq => nq
                .Match(m => m
                    .Query(query)
                    // Typed expression instead of the literal "remarks.remarktext": the
                    // client infers the field name from the property with the same rules
                    // it used when the documents were indexed, so the name cannot drift
                    // in casing from the mapped field (by default RemarkText becomes
                    // "remarkText", which the all-lowercase literal never matched).
                    .Field(f => f.Remarks.First().RemarkText)
                )
            )
        )
    );
response = client.Search<DtoTypes.Customer.SearchResult>(searchDescriptor);
I don't know if I'm bulk loading the index properly and if it's smart enough to know that the Remarks property is a nested property and to load those as well.
The search has no errors, but I get no results.
The search query is generating this json, which from what I can tell is OK:
{
"from": 0,
"size": 100,
"query": {
"nested": {
"query": {
"match": {
"remarks.remarktext": {
"query": "test"
}
}
},
"path": "remarks"
}
}
}
I do see the remark data when looking at json using a query string http://127.0.0.1:9200/customersearch/_search
I want to be able to search nested objects and return the parents -
only the parents, without the list of Remarks, but I would like
highlights from the remarks returned if possible.
What about this idea: exclude the nested object from the source, but keep the highlight on the nested field. Here is what I mean:
/// <summary>
/// Parent document used in the example; it holds a single nested object.
/// </summary>
public class Document
{
public int Id { get; set; }
// Mapped as a nested field so it can be targeted by a nested query.
[Nested]
public Nested Nested { get; set; }
}
// Create the index with the mapping inferred from the Document class.
var createIndexResponse = client.CreateIndex(indexName, descriptor => descriptor
.Mappings(map => map
.Map<Document>(m => m
.AutoMap()
)));
// Two sample documents that differ in the nested object's Name.
var items = new List<Document>
{
new Document
{
Id = 1,
Nested = new Nested {Name = "Robert" }
},
new Document
{
Id = 2,
Nested = new Nested {Name = "Someone" }
}
};
var bulkResponse = client.IndexMany(items);
// Refresh the index after the bulk request and before searching.
client.Refresh(indexName);
// Nested match on "nested.name"; the Nested property is excluded from the
// returned _source while a highlight on "nested.name" is still requested.
// NOTE(review): PostTags and PreTags are both the literal "<b>"; if a closing
// tag is wanted, the post tag would presumably be "</b>".
var searchResponse = client.Search<Document>(s => s
.Source(so => so.Exclude(e => e.Field(f => f.Nested)))
.Highlight(h => h.Fields(f => f.Field("nested.name")).PostTags("<b>").PreTags("<b>"))
.Query(q => q
.Nested(n => n
.Path(p => p.Nested)
.Query(nq => nq.Match(m => m
.Query("Robert").Field("nested.name"))))));
And what elasticsearch returns is
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [{
"_index" : "my_index",
"_type" : "document",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"id" : 1
},
"highlight" : {
"nested.name" : ["<b>Robert<b>"]
}
}
]
}
}
What do you think?

Combining queries using bool in Nest Elasticsearch

I need to get the documents from ES using NEST client with multiple And/OR conditions on two fields.
My query is as:
SELECT * FROM Document WHERE (Year!=2012 && Year!=2013 ) AND (Format=".pdf" || Format=".prt" || Format=".jpeg")
below is my code:
// One match query per excluded year; these become the must_not clauses.
var qc = new List<QueryContainer>();
foreach (var year in years)// years is the list of years that must not be included
{
qc.Add(Query<Document>.Match(m => m.OnField(p => p.Year).Query(year)));
}
// One match query per wanted format; these become the should clauses.
var qF = new List<QueryContainer>();
foreach (var format in txtDocs)// txtDocs is the list of formats that should be included if available
{
qF.Add(Query<Document>.Match(m => m.OnField(p => p.Format).Query(format)));
}
// First 50 hits of a bool query combining the two clause lists.
var searchResults = client.Search<Document>(s => s.Index(defaultIndex).From(0).Size(50).
Query(
f => f.Bool(
b => b
.MustNot(qc.ToArray()).Should(qF.ToArray()))));
When I try this code it works for the years that must not appear in the results but for the formats that should be selected by user, it doesn't show those selected formats although they are available.
I also used "must" instead of "should", but then it does not retrieve anything at all.
Has anyone had such a similar problem?
/// <summary>
/// Sample document for the answer: Format is mapped as not-analyzed so that
/// term queries can be run against values such as ".pdf".
/// </summary>
public class Test
{
public int Year { get; set; }
[ElasticProperty(Index = FieldIndexOption.NotAnalyzed)]
public string Format { get; set; }
}
// Bool query: term clauses on the excluded years under must_not, and term
// clauses on the wanted formats under should.
var searchResponse = client.Search<Test>(s => s.Query(q => q
.Bool(b => b
.MustNot(
m => m.Term(t => t.OnField(f => f.Year).Value(2012)),
m => m.Term(t => t.OnField(f => f.Year).Value(2013))
)
.Should(
should => should.Term(t => t.OnField(f => f.Format).Value(".pdf")),
should => should.Term(t => t.OnField(f => f.Format).Value(".prt")),
should => should.Term(t => t.OnField(f => f.Format).Value(".jpeg"))
)
)));
Hope it helps.
Here is the code for making a dynamic query:
// Accumulate one term query per excluded year into a single container using
// the |= operator, starting from null.
QueryContainer qYear=null;
foreach (var year in years)
{
qYear |= new TermQuery() { Field = "year", Value = year };
}
// Same accumulation for the wanted formats.
QueryContainer qDoc=null;
foreach (var format in txtDocs)
{
qDoc|=new TermQuery() {Field="format", Value= format};
}
// First 50 hits: formats go under should, years under must_not.
var searchResults = client.Search<Document>(s => s.Index(defaultIndex).From(0).Size(50).
Query(q => q.Bool(b => b.Should(qDoc).MustNot(qYear))));

Resources