How to connect HDFS HA for Elasticsearch snapshots? - elasticsearch

I want the connection to HDFS HA to fail over automatically when the namenode switches from active to standby. Which URI should I use?
PUT _snapshot/my_hdfs_repository
{
  "type": "hdfs",
  "settings": {
    "uri": "hdfs://namenode:8020/",
    "path": "/user/elasticsearch/repositories"
  }
}
Until now, I have been changing the URI manually whenever the HDFS namenode switches.

These are my settings with HA HDFS and Kerberos enabled.
PUT /_snapshot/elastic_hdfs_repository
{
  "type" : "hdfs",
  "settings" : {
    "dfs" : {
      "http" : {
        "policy" : "HTTPS_ONLY"
      }
    },
    "path" : "/elasticsearch/repositories/elastic_hdfs_repository",
    "conf" : {
      "dfs" : {
        "client" : {
          "failover" : {
            "proxy" : {
              "provider" : {
                "my-cluster-nameservice" : "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
              }
            }
          }
        },
        "ha" : {
          "automatic-failover" : {
            "enabled" : {
              "my-cluster-nameservice" : "true"
            }
          },
          "namenodes" : {
            "my-cluster-nameservice" : "namenode1,namenode2"
          }
        },
        "data" : {
          "transfer" : {
            "protection" : "privacy"
          }
        },
        "namenode" : {
          "rpc-address" : {
            "my-cluster-nameservice" : {
              "namenode1" : "nn1.domain.com:8020",
              "namenode2" : "nn2.domain.com:8020"
            }
          }
        },
        "nameservices" : "my-cluster-nameservice"
      },
      "fs" : {
        "defaultFS" : "hdfs://elastic_hdfs_repository",
        "hdfs" : {
          "impl" : "org.apache.hadoop.hdfs.DistributedFileSystem"
        }
      },
      "hadoop.http.authentication.token.validity": 36000
    },
    "security" : {
      "principal" : "elasticsearch/_HOST@DOMAIN.COM"
    },
    "uri" : "hdfs://my-cluster-nameservice"
  }
}
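With the uri pointing at the nameservice and the failover provider configured under conf, the repository should keep working when the active namenode changes. To confirm the repository can still reach HDFS, for example right after a failover, the snapshot verify API can be used (a usage sketch with the repository name from above):
POST /_snapshot/elastic_hdfs_repository/_verify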

Related

Elasticsearch ILM not rolling over

I have configured my ILM policy to roll over when the index size reaches 20GB or after 30 days on the hot node,
but my index passed 20GB and still didn't move to the cold node.
When I run GET _cat/indices?v I get:
green open packetbeat-7.9.2-2020.10.22-000001 RRAnRZrrRZiihscJ3bymig 10 1 63833049 0 44.1gb 22gb
Could you tell me how to solve this, please?
Note that in my packetbeat configuration file I have only changed the number of shards:
setup.template.settings:
  index.number_of_shards: 10
  index.number_of_replicas: 1
When I run the command GET packetbeat-7.9.2-2020.10.22-000001/_settings I get this output:
{
  "packetbeat-7.9.2-2020.10.22-000001" : {
    "settings" : {
      "index" : {
        "lifecycle" : {
          "name" : "packetbeat",
          "rollover_alias" : "packetbeat-7.9.2"
        },
        "routing" : {
          "allocation" : {
            "include" : {
              "_tier_preference" : "data_content"
            }
          }
        },
        "mapping" : {
          "total_fields" : {
            "limit" : "10000"
          }
        },
        "refresh_interval" : "5s",
        "number_of_shards" : "10",
        "provided_name" : "<packetbeat-7.9.2-{now/d}-000001>",
        "max_docvalue_fields_search" : "200",
        "query" : {
          "default_field" : [
            "message",
            "tags",
            "agent.ephemeral_id",
            "agent.id",
            "agent.name",
            "agent.type",
            "agent.version",
            "as.organization.name",
            "client.address",
            "client.as.organization.name",
            ...
The output of the command GET /packetbeat-7.9.2-2020.10.22-000001/_ilm/explain is:
{
  "indices" : {
    "packetbeat-7.9.2-2020.10.22-000001" : {
      "index" : "packetbeat-7.9.2-2020.10.22-000001",
      "managed" : true,
      "policy" : "packetbeat",
      "lifecycle_date_millis" : 1603359683835,
      "age" : "15.04d",
      "phase" : "hot",
      "phase_time_millis" : 1603359684332,
      "action" : "rollover",
      "action_time_millis" : 1603360173138,
      "step" : "check-rollover-ready",
      "step_time_millis" : 1603360173138,
      "phase_execution" : {
        "policy" : "packetbeat",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "50gb",
              "max_age" : "30d"
            }
          }
        },
        "version" : 1,
        "modified_date_in_millis" : 1603359683339
      }
    }
  }
}
It's weird that it shows 50GB!
Thanks for your help.
So I found the solution to this problem.
After updating the policy, I removed the policy from the indices that were using it and then added it back to those indices.
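For reference, a rough sketch of that remove-and-reapply step, using the index and policy names from this example (adjust to your own names):
POST packetbeat-7.9.2-2020.10.22-000001/_ilm/remove

PUT packetbeat-7.9.2-2020.10.22-000001/_settings
{
  "index.lifecycle.name": "packetbeat",
  "index.lifecycle.rollover_alias": "packetbeat-7.9.2"
}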

elasticsearch - moving from multiple servers to one server

I have a cluster of 5 servers for Elasticsearch, all running the same version of Elasticsearch.
I need to move all data from servers 2, 3, 4 and 5 to server 1.
How can I do that?
And how can I tell which servers actually hold data?
After changing _cluster/settings with:
PUT _cluster/settings
{
  "persistent" : {
    "cluster.routing.allocation.require._host" : "server1"
  }
}
I get the following for curl -XGET http://localhost:9200/_cat/allocation?v:
shards disk.indices disk.used disk.avail disk.total disk.percent host ip node
6 54.5gb 170.1gb 1.9tb 2.1tb 7 *.*.*.* *.*.*.* node-5
6 50.4gb 167.4gb 1.9tb 2.1tb 7 *.*.*.* *.*.*.* node-3
6 22.6gb 139.8gb 2tb 2.1tb 6 *.*.*.* *.*.*.* node-2
6 49.8gb 166.6gb 1.9tb 2.1tb 7 *.*.*.* *.*.*.* node-4
6 54.8gb 172.1gb 1.9tb 2.1tb 7 *.*.*.* *.*.*.* node-1
and for: GET _cluster/settings?include_defaults
the following:
#! Deprecation: [node.max_local_storage_nodes] setting was deprecated in Elasticsearch and will be removed in a future release!
{
  "persistent" : {
    "cluster" : {
      "routing" : {
        "allocation" : {
          "require" : {
            "_host" : "server1"
          }
        }
      }
    }
  },
  "transient" : { },
  "defaults" : {
    "cluster" : {
      "max_voting_config_exclusions" : "10",
      "auto_shrink_voting_configuration" : "true",
      "election" : {
        "duration" : "500ms",
        "initial_timeout" : "100ms",
        "max_timeout" : "10s",
        "back_off_time" : "100ms",
        "strategy" : "supports_voting_only"
      },
      "no_master_block" : "write",
      "persistent_tasks" : {
        "allocation" : {
          "enable" : "all",
          "recheck_interval" : "30s"
        }
      },
      "blocks" : {
        "read_only_allow_delete" : "false",
        "read_only" : "false"
      },
      "remote" : {
        "node" : {
          "attr" : ""
        },
        "initial_connect_timeout" : "30s",
        "connect" : "true",
        "connections_per_cluster" : "3"
      },
      "follower_lag" : {
        "timeout" : "90000ms"
      },
      "routing" : {
        "use_adaptive_replica_selection" : "true",
        "rebalance" : {
          "enable" : "all"
        },
        "allocation" : {
          "node_concurrent_incoming_recoveries" : "2",
          "node_initial_primaries_recoveries" : "4",
          "same_shard" : {
            "host" : "false"
          },
          "total_shards_per_node" : "-1",
          "shard_state" : {
            "reroute" : {
              "priority" : "NORMAL"
            }
          },
          "type" : "balanced",
          "disk" : {
            "threshold_enabled" : "true",
            "watermark" : {
              "low" : "85%",
              "flood_stage" : "95%",
              "high" : "90%"
            },
            "include_relocations" : "true",
            "reroute_interval" : "60s"
          },
          "awareness" : {
            "attributes" : [ ]
          },
          "balance" : {
            "index" : "0.55",
            "threshold" : "1.0",
            "shard" : "0.45"
          },
          "enable" : "all",
          "node_concurrent_outgoing_recoveries" : "2",
          "allow_rebalance" : "indices_all_active",
          "cluster_concurrent_rebalance" : "2",
          "node_concurrent_recoveries" : "2"
        }
      },
      ...
      "nodes" : {
        "reconnect_interval" : "10s"
      },
      "service" : {
        "slow_master_task_logging_threshold" : "10s",
        "slow_task_logging_threshold" : "30s"
      },
      ...
      "name" : "cluster01",
      ...
      "max_shards_per_node" : "1000",
      "initial_master_nodes" : [ ],
      "info" : {
        "update" : {
          "interval" : "30s",
          "timeout" : "15s"
        }
      }
    },
    ...
You can use shard allocation filtering to move all your data to server 1.
Simply run this:
PUT _cluster/settings
{
  "persistent" : {
    "cluster.routing.allocation.require._name" : "node-1",
    "cluster.routing.allocation.exclude._name" : "node-2,node-3,node-4,node-5"
  }
}
Instead of _name you can also use _ip or _host depending on what is more practical for you.
After running this command, all primary shards will migrate to server1 (the replicas will be unassigned). You just need to make sure that server1 has enough storage space to store all the primary shards.
If you want to get rid of the unassigned replicas (and get back to green state), simply run this:
PUT _all/_settings
{
  "index" : {
    "number_of_replicas" : 0
  }
}
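To keep an eye on the relocation while the shards drain from the other nodes, the standard cat and health APIs are handy (a usage sketch):
GET _cat/shards?v&h=index,shard,prirep,state,node
GET _cluster/health
Once everything shows up on node-1 and the health is green, the other nodes can be shut down.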

How to project a DBRef in a Spring Data MongoDB aggregation?

I have the following aggregation done in a MongoDB shell to get the number of alerts of each type for each user:
db.getCollection('alerts').aggregate(
  {
    $unwind: "$son"
  },
  {
    $group: {
      _id: {
        son: "$son",
        level: "$level"
      },
      count: { $sum: 1 }
    }
  },
  {
    $group: {
      _id: {
        son: "$_id.son"
      },
      alerts: { $addToSet: {
        level: "$_id.level",
        count: "$count"
      }}
    }
  }
)
I have translated it to Spring Data MongoDB as follows:
TypedAggregation<AlertEntity> alertsAggregation =
    Aggregation.newAggregation(AlertEntity.class,
        unwind("$son"),
        Aggregation.group("$son", "$level").count().as("count"),
        Aggregation.group("$_id.son")
            .addToSet(new BasicDBObject("level", "$_id.level").append("count", "$count")).as("alerts"));
// Aggregation.match(Criteria.where("_id").in(sonIds))
AggregationResults<AlertsBySonDTO> results = mongoTemplate
    .aggregate(alertsAggregation, AlertsBySonDTO.class);
List<AlertsBySonDTO> alertsBySonResultsList = results.getMappedResults();
return alertsBySonResultsList;
What I am not clear on, and cannot get working, is how to project the identifier and, if possible, the name of the user (the son variable).
The resulting DTO is as follows:
public final class AlertsBySonDTO implements Serializable {

    private static final long serialVersionUID = 1L;

    @JsonProperty("identity")
    private String id;

    @JsonProperty("alerts")
    private ArrayList<Map<String, String>> alerts;
}
but the id property ends up containing the entire embedded child entity.
This is the structure of the collection of alerts.
JSON alerts format:
{
  "_id" : ObjectId("59e6ff3d9ef9d46a91112890"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "INFO",
  "title" : "Alerta de Prueba",
  "payload" : "Alerta de Prueba",
  "create_at" : ISODate("2017-10-18T07:13:45.091Z"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : {
    "$ref" : "parents",
    "$id" : ObjectId("59e6ff369ef9d46a91112878")
  },
  "son" : {
    "$ref" : "children",
    "$id" : ObjectId("59e6ff389ef9d46a9111287b")
  }
}
/* 2 */
{
  "_id" : ObjectId("59e6ff6d9ef9d46a91112892"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "WARNING",
  "title" : "Token de acceso inválido.",
  "payload" : "El token de acceso YOUTUBE no es válido",
  "create_at" : ISODate("2017-10-18T07:14:53.449Z"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : {
    "$ref" : "parents",
    "$id" : ObjectId("59e6ff369ef9d46a91112878")
  },
  "son" : {
    "$ref" : "children",
    "$id" : ObjectId("59e6ff389ef9d46a9111287b")
  }
}
/* 3 */
{
  "_id" : ObjectId("59e6ff6d9ef9d46a91112893"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "WARNING",
  "title" : "Token de acceso inválido.",
  "payload" : "El token de acceso INSTAGRAM no es válido",
  "create_at" : ISODate("2017-10-18T07:14:53.468Z"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : {
    "$ref" : "parents",
    "$id" : ObjectId("59e6ff369ef9d46a91112878")
  },
  "son" : {
    "$ref" : "children",
    "$id" : ObjectId("59e6ff389ef9d46a9111287c")
  }
}
Does anyone know how I can approach this?
Thanks in advance.
1. With MongoDB version 3.4
These are the collections I created to reproduce your use case:
Alerts Collection
{
  "_id" : ObjectId("59e6ff3d9ef9d46a91112890"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "INFO",
  "title" : "Alerta de Prueba",
  "payload" : "Alerta de Prueba",
  "create_at" : ISODate("2017-10-18T07:13:45.091+0000"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : DBRef("parents", ObjectId("59e6ff369ef9d46a91112878")),
  "son" : DBRef("children", ObjectId("59e72ff0572ae72d8c063666"))
}
{
  "_id" : ObjectId("59e6ff6d9ef9d46a91112892"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "WARNING",
  "title" : "Token de acceso inválido.",
  "payload" : "El token de acceso YOUTUBE no es valido",
  "create_at" : ISODate("2017-10-18T07:14:53.449+0000"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : DBRef("parents", ObjectId("59e6ff369ef9d46a91112878")),
  "son" : DBRef("children", ObjectId("59e72ff0572ae72d8c063666"))
}
{
  "_id" : ObjectId("59e6ff6d9ef9d46a91112893"),
  "_class" : "es.bisite.usal.bulltect.persistence.entity.AlertEntity",
  "level" : "WARNING",
  "title" : "Token de acceso inválido.",
  "payload" : "El token de acceso INSTAGRAM no es válido",
  "create_at" : ISODate("2017-10-18T07:14:53.468+0000"),
  "delivery_mode" : "PUSH_NOTIFICATION",
  "delivered" : false,
  "parent" : DBRef("parents", ObjectId("59e6ff369ef9d46a91112878")),
  "son" : DBRef("children", ObjectId("59e72ffb572ae72d8c063669"))
}
Notice I changed the ObjectIds of the son references to match the children collection I created.
Children collection
{
  "_id" : ObjectId("59e72ff0572ae72d8c063666"),
  "name" : "Bob"
}
{
  "_id" : ObjectId("59e72ffb572ae72d8c063669"),
  "name" : "Tim"
}
Since you are using a reference, you can't just access a field from the other collection, so I think you are missing some aggregation steps.
I did the following:
db.getCollection('alerts').aggregate(
  {
    $unwind: "$son"
  },
  {
    $group: {
      _id: {
        son: "$son",
        level: "$level"
      },
      count: { $sum: 1 }
    }
  },
  {
    $group: {
      _id: {
        son: "$_id.son"
      },
      alerts: { $addToSet: {
        level: "$_id.level",
        count: "$count"
      }}
    }
  },
  { $addFields: { sonsArray: { $objectToArray: "$_id.son" } } },
  { $match: { "sonsArray.k": "$id" } },
  { $lookup: { from: "children", localField: "sonsArray.v", foreignField: "_id", as: "name" } }
)
And got the following results as JSON:
{
  "_id" : {
    "son" : DBRef("children", ObjectId("59e72ffb572ae72d8c063669"))
  },
  "alerts" : [
    {
      "level" : "WARNING",
      "count" : NumberInt(1)
    }
  ],
  "sonsArray" : [
    {
      "k" : "$ref",
      "v" : "children"
    },
    {
      "k" : "$id",
      "v" : ObjectId("59e72ffb572ae72d8c063669")
    }
  ],
  "name" : [
    {
      "_id" : ObjectId("59e72ffb572ae72d8c063669"),
      "name" : "Tim"
    }
  ]
}
{
  "_id" : {
    "son" : DBRef("children", ObjectId("59e72ff0572ae72d8c063666"))
  },
  "alerts" : [
    {
      "level" : "INFO",
      "count" : NumberInt(1)
    },
    {
      "level" : "WARNING",
      "count" : NumberInt(1)
    }
  ],
  "sonsArray" : [
    {
      "k" : "$ref",
      "v" : "children"
    },
    {
      "k" : "$id",
      "v" : ObjectId("59e72ff0572ae72d8c063666")
    }
  ],
  "name" : [
    {
      "_id" : ObjectId("59e72ff0572ae72d8c063666"),
      "name" : "Bob"
    }
  ]
}
If you want to get rid of the additionally created fields such as sonsArray, you can add a $project stage to clean up your result.
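A minimal example of that cleanup, appended as a final stage to the pipeline above (exclusion-style $project assumes MongoDB 3.4+):
{ $project: { sonsArray: 0 } }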
2. If you have an older version of MongoDB and can change your data structure
If, instead of using a reference like this:
"son" : DBRef("children", ObjectId("59e72ffb572ae72d8c063669"))
you store the ObjectId(s) of the son(s) as an array like this:
"sonId" : [
ObjectId("59e72ff0572ae72d8c063666")
]
then you can do your aggregation as follows:
db.getCollection('alerts').aggregate(
  {
    $unwind: "$sonId"
  },
  {
    $group: {
      _id: {
        sonId: "$sonId",
        level: "$level"
      },
      count: { $sum: 1 }
    }
  },
  {
    $group: {
      _id: {
        sonId: "$_id.sonId"
      },
      alerts: { $addToSet: {
        level: "$_id.level",
        count: "$count"
      }}
    }
  },
  { $lookup: { from: "children", localField: "_id.sonId", foreignField: "_id", as: "son" } }
)
Is that something you are looking for?

Having trouble with a slow MongoDB aggregation query

I have the following aggregation query in MongoDB
return mongoose.model('Submission')
  .aggregate([
    { $match: { client: { $in: clientIds }, admin: this._admin._id } },
    { $sort: { client: 1, submitted: -1 } },
    { $group: {
      _id: '$client',
      lastSubmitted: { $first: '$submitted' },
      timezone: { $first: '$timezone' },
    } },
  ])
  .exec();
which is performing really badly on a collection with about 2000 documents. It usually takes 5 seconds to complete, and I've seen it take as long as 15 seconds. I have the following index on the submissions collection:
{
  client : 1,
  admin : 1,
  assessment : 1,
  submitted : -1,
}
I'm stuck as to why it's taking so long. Any suggestions?
EDIT
I've run the query
db.submissions.aggregate([
  { $match: {
    client: { $in: ['54a4cdfdd0666c243035dc98', '55cc985291a0ffab6849de34'] },
    admin: '542b4af8880fc300007eb411'
  }},
  { $sort: { client: 1, submitted: -1 } },
  { $group: {
    _id: '$client',
    lastSubmitted: { $first: '$submitted' },
    timezone: { $first: '$timezone' }
  }}
], { explain: true })
in the shell with explain and got
{
  "stages" : [
    {
      "$cursor" : {
        "query" : {
          "client" : {
            "$in" : [
              "54a4cdfdd0666c243035dc98",
              "55cc985291a0ffab6849de34"
            ]
          },
          "admin" : "542b4af8880fc300007eb411"
        },
        "fields" : {
          "client" : 1,
          "submitted" : 1,
          "timezone" : 1,
          "_id" : 0
        },
        "queryPlanner" : {
          "plannerVersion" : 1,
          "namespace" : "webdemo.submissions",
          "indexFilterSet" : false,
          "parsedQuery" : {
            "$and" : [
              {
                "admin" : {
                  "$eq" : "542b4af8880fc300007eb411"
                }
              },
              {
                "client" : {
                  "$in" : [
                    "54a4cdfdd0666c243035dc98",
                    "55cc985291a0ffab6849de34"
                  ]
                }
              }
            ]
          },
          "winningPlan" : {
            "stage" : "FETCH",
            "inputStage" : {
              "stage" : "IXSCAN",
              "keyPattern" : {
                "client" : 1,
                "admin" : 1,
                "assessment" : 1,
                "submitted" : -1
              },
              "indexName" : "client_1_admin_1_assessment_1_submitted_-1",
              "isMultiKey" : false,
              "direction" : "forward",
              "indexBounds" : {
                "client" : [
                  "[\"54a4cdfdd0666c243035dc98\", \"54a4cdfdd0666c243035dc98\"]",
                  "[\"55cc985291a0ffab6849de34\", \"55cc985291a0ffab6849de34\"]"
                ],
                "admin" : [
                  "[\"542b4af8880fc300007eb411\", \"542b4af8880fc300007eb411\"]"
                ],
                "assessment" : [
                  "[MinKey, MaxKey]"
                ],
                "submitted" : [
                  "[MaxKey, MinKey]"
                ]
              }
            }
          },
          "rejectedPlans" : [ ]
        }
      }
    },
    {
      "$sort" : {
        "sortKey" : {
          "client" : 1,
          "submitted" : -1
        }
      }
    },
    {
      "$group" : {
        "_id" : "$client",
        "lastSubmitted" : {
          "$first" : "$submitted"
        },
        "timezone" : {
          "$first" : "$timezone"
        }
      }
    }
  ],
  "ok" : 1
}
EDIT 2
The output I get from db.submissions.getIndices() is
[
  {
    "v" : 1,
    "key" : {
      "_id" : 1
    },
    "name" : "_id_",
    "ns" : "webdemo.submissions"
  },
  {
    "v" : 1,
    "key" : {
      "client" : 1,
      "admin" : 1,
      "assessment" : 1,
      "submitted" : -1
    },
    "name" : "client_1_admin_1_assessment_1_submitted_-1",
    "ns" : "webdemo.submissions",
    "background" : true
  }
]
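One thing worth trying here is an index whose key order matches the equality filter first and then the sort keys, so the in-memory $sort stage can be avoided (a sketch only; whether it helps depends on how client and admin are actually stored, e.g. string vs ObjectId):
db.submissions.createIndex({ admin: 1, client: 1, submitted: -1 })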

Specify Routing on Index Alias's Term Lookup Filter

I am using Logstash, ElasticSearch and Kibana to allow multiple users to log in and view the log data they have forwarded. I have created index aliases for each user. These restrict their results to contain only their own data.
I'd like to assign users to groups, and allow users to view data for the computers in their group. I created a parent-child relationship between the groups and the users, and I created a term lookup filter on the alias.
My problem is, I receive a RoutingMissingException when I try to apply the alias.
Is there a way to specify the routing for the term lookup filter? How can I look up terms on a parent document?
I posted the mapping and alias below, but a full gist recreation is available at this link.
curl -XPUT 'http://localhost:9200/accesscontrol/' -d '{
  "mappings" : {
    "group" : {
      "properties" : {
        "name" : { "type" : "string" },
        "hosts" : { "type" : "string" }
      }
    },
    "user" : {
      "_parent" : { "type" : "group" },
      "_routing" : { "required" : true, "path" : "group_id" },
      "properties" : {
        "name" : { "type" : "string" },
        "group_id" : { "type" : "string" }
      }
    }
  }
}'
# Create the logstash alias for cvializ
curl -XPOST 'http://localhost:9200/_aliases' -d '
{
  "actions" : [
    { "remove" : { "index" : "logstash-2014.04.25", "alias" : "cvializ-logstash-2014.04.25" } },
    {
      "add" : {
        "index" : "logstash-2014.04.25",
        "alias" : "cvializ-logstash-2014.04.25",
        "routing" : "intern",
        "filter" : {
          "terms" : {
            "host" : {
              "index" : "accesscontrol",
              "type" : "user",
              "id" : "cvializ",
              "path" : "group.hosts"
            },
            "_cache_key" : "cvializ_hosts"
          }
        }
      }
    }
  ]
}'
In attempting to find a workaround for this error, I submitted a bug report to the Elasticsearch team and received an answer from them. It was a bug in Elasticsearch where the filter is applied before the dynamic mapping, causing some erroneous output. I've included their workaround below:
PUT /accesscontrol/group/admin
{
  "name" : "admin",
  "hosts" : ["computer1","computer2","computer3"]
}

PUT /_template/admin_group
{
  "template" : "logstash-*",
  "aliases" : {
    "template-admin-{index}" : {
      "filter" : {
        "terms" : {
          "host" : {
            "index" : "accesscontrol",
            "type" : "group",
            "id" : "admin",
            "path" : "hosts"
          }
        }
      }
    }
  },
  "mappings" : {
    "example" : {
      "properties" : {
        "host" : {
          "type" : "string"
        }
      }
    }
  }
}
POST /logstash-2014.05.09/example/1
{
  "message" : "my sample data",
  "@version" : "1",
  "@timestamp" : "2014-05-09T16:25:45.613Z",
  "type" : "example",
  "host" : "computer1"
}
GET /template-admin-logstash-2014.05.09/_search
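To sanity-check that the template actually attached the filtered alias to a newly created daily index, listing the aliases on that index should show template-admin-logstash-2014.05.09 (a usage sketch):
GET /logstash-2014.05.09/_alias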
