OrientDB: slow import of a large dataset, how to make it faster? - performance

I'm working on a network of 17M edges and 20K vertices, and I'm loading it into OrientDB using the ETL tool, but it is taking forever to load.
I tried varying the batch size from 1000 to 100000, yet still no change.
Is there an optimized way to make it load faster, other than using the Java API?
Any help would be appreciated.
I'm using the 2.2.20 community version.
Here is the ETL for the import:
{
"source": { "file": { "path": "C:/Users/Muuna/Desktop/files/H.csv" } },
"extractor": { "csv": {
"separator": ",",
"columnsOnFirstLine": true,
"ignoreEmptyLines": true,
"columns": ["id:Integer","p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "command": { "command": "UPDATE H set p='${input.p1}' UPSERT WHERE p='${input.p1}'"},"vertex": { "class": "H", "skipDuplicates": true} }
],
"loader": {
"orientdb": {
"dbURL": "PLOCAL:C:/orientdb/databases/Graph",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"classes": [
{"name": "H", "extends": "V"},
{"name": "HAS_S", "extends": "E"}
],"indexes": [ {"class":"H", "fields":["p:String"], "type":"UNIQUE" }]
}
}
}
Based on [1]: orientdb load graph csv of nodes and edges
The same script is run twice to import the two vertex columns, and a separate ETL configuration is used for loading the edges.
Edges, also based on [1]:
{
"source": { "file": { "path": "C:/Users/Muuna/Desktop/files/H.csv" } },
"extractor": { "csv": {
"separator": ",",
"columnsOnFirstLine": true,
"ignoreEmptyLines": true,
"columns": ["id:Integer","p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "command": { "command": "CREATE EDGE HAS_S FROM (SELECT FROM H WHERE p='${input.p1}') TO (SELECT FROM H WHERE p='${input.p2}') set score=${input.s}"} }
],
"loader": {
"orientdb": {
"dbURL": "PLOCAL:C:/orientdb/databases/Graph",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"classes": [
{"name": "H", "extends": "V"},
{"name": "HAS_S", "extends": "E"}
],"indexes": [ {"class":"H", "fields":["p:String"], "type":"UNIQUE" }]
}
}
}
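One direction worth trying, as a sketch rather than a tested answer: the command transformer runs one SQL statement per CSV row (an UPDATE ... UPSERT per vertex and a CREATE EDGE with two sub-queries per edge), which is usually what makes a 17M-edge load crawl. The ETL's own merge/vertex/edge transformers with a lookup on the indexed property, plus loader-level batching, avoid most of that per-row overhead. The configuration below reuses the classes, columns and index from the scripts above; treat batchCommit, wal and the edgeFields expression as assumptions to check against the 2.2.x ETL documentation.
{
"source": { "file": { "path": "C:/Users/Muuna/Desktop/files/H.csv" } },
"extractor": { "csv": {
"separator": ",",
"columnsOnFirstLine": true,
"ignoreEmptyLines": true,
"columns": ["id:Integer","p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "merge": { "joinFieldName": "p1", "lookup": "H.p" } },
{ "vertex": { "class": "H", "skipDuplicates": true } },
{ "edge": { "class": "HAS_S",
"joinFieldName": "p2",
"lookup": "H.p",
"unresolvedLinkAction": "CREATE",
"edgeFields": { "score": "${input.s}" } } },
{ "field": { "fieldNames": ["p1", "p2", "s"], "operation": "remove" } }
],
"loader": {
"orientdb": {
"dbURL": "PLOCAL:C:/orientdb/databases/Graph",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"wal": false,
"batchCommit": 10000,
"classes": [
{"name": "H", "extends": "V"},
{"name": "HAS_S", "extends": "E"}
],"indexes": [ {"class":"H", "fields":["p:String"], "type":"UNIQUE" }]
}
}
}
With this approach the two vertex-only passes may become unnecessary, since missing vertices are created by the merge transformer and unresolvedLinkAction, but that is worth verifying on a small sample first.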

Related

Elastic / OpenSearch index lifecycle management - what is the difference between read_write & open actions

I want to use index lifecycle management; the goal is to delete messages after 14 days.
What should the action in the first state be: open or read_write?
What is the difference between the two actions?
{
"policy": {
"policy_id": "delete_after14_days",
"description": "index delete"
"schema_version": 1,
"error_notification": null,
"default_state": "open",
"states": [
{
"name": "hot",
"actions": [
{
**"open": {} or "read_write": {}**
}
],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "14d"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
],
"ism_template": [
{
"index_patterns": [
"audit-*"
],
"priority": 0
}
]
}
}
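For what it's worth (not from the original thread): in the Index State Management plugin, open re-opens a previously closed index, while read_write clears a read-only block and makes the index writeable again; neither is strictly required just to age an index out. A minimal sketch of the hot state under that reading, with read_write as a harmless default:
{
"name": "hot",
"actions": [
{ "read_write": {} }
],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "14d"
}
}
]
}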

rethinkdb python eqJoin with filter not working

I have two tables, records:
[
{
"payload": {
"uuid": "123",
"version": "1.0.0"
},
"record_id":"rec-123"
},
{
"payload": {
"uuid": "456",
"version": "1.0.1"
},
"record_id":"rec-456"
}
]
and records_master:
[
{
"version": {
"mode": 1
},
"id": "rec-123",
"title": "Test-123"
},
{
"version": {
"mode": 0
},
"title": "Test-456",
"id": "rec-456"
}
]
I want to get the records matching payload.uuid and payload.version in records where version.mode is 1 in records_master.
data = r.table("records").filter(
lambda doc:
(doc["payload"]["uuid"]== <some_value>) &
(doc["payload"]["version"]== <some_version>)
).coerce_to("array")
.run(connection)
This gives me all records for a matching payload.uuid and payload.version.
But I am unable to make it work with eq_join against records-master:
data = r.table('records').eq_join('record_id', r.table('records-master')).filter(
lambda doc,master:
(doc["payload"]["uuid"]== <some-value>) &
(doc["payload"]["version"]== <some-version>) &
(master["version"]["mode"] ==1)
).run(connection)
This query worked for me:
data = r.table("records").filter(
lambda doc:
(doc["payload"]["uuid"]== <some_value>) &
(doc["payload"]["version"]== <some_version>)
).eq_join('record_id',
r.table('records-master')
)\
.filter(lambda master:(master["right"]["version"]["mode"]== 1))\
.coerce_to("array")\
.run(connection)
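For what it's worth, a slightly more compact variant of the same query (an untested sketch; some_uuid and some_version stand in for the placeholder values above, and r and connection are assumed to be set up as in the snippets): zip() merges the joined left and right documents, so the final filter can drop the ["right"] prefix.
data = (
    r.table("records")
    # keep only the records matching the requested uuid/version
    .filter(lambda doc:
        (doc["payload"]["uuid"] == some_uuid) &
        (doc["payload"]["version"] == some_version))
    # join each record to its master row by record_id
    .eq_join("record_id", r.table("records-master"))
    # merge left/right into a single document
    .zip()
    # after zip(), the master fields are top-level
    .filter(lambda row: row["version"]["mode"] == 1)
    .coerce_to("array")
    .run(connection)
)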

NiFi: ReplaceText alternatives to modify JSON

My NiFi application receives two somewhat different types of JSON.
The first of them looks like:
[
{
"campaign": {
"resourceName": "customers/8952771329/campaigns/11381694617",
"status": "ENABLED",
"name": "Saint_Spring_Active Minerals_oct-nov_2020_trueview_skip_5766500views",
"id": "11381694617"
},
"metrics": {
"interactionEventTypes": [
"VIDEO_VIEW"
],
"clicks": "6",
"videoQuartileP100Rate": 0.44493171079034244,
"videoQuartileP25Rate": 0.9747718298919024,
"videoQuartileP50Rate": 0.7339309987701469,
"videoQuartileP75Rate": 0.5337562301767105,
"videoViewRate": 0.4471109114825628,
"videoViews": "27872",
"viewThroughConversions": "0",
"contentBudgetLostImpressionShare": 0.0000013066088274492382,
"contentImpressionShare": 0.0999,
"contentRankLostImpressionShare": 0.9001,
"conversionsValue": 0,
"conversions": 0,
"costMicros": "9338700950",
"ctr": 0.00009624947864865732,
"currentModelAttributedConversions": 0,
"currentModelAttributedConversionsValue": 0,
"engagementRate": 0,
"engagements": "0",
},
"segments": {
"device": "CONNECTED_TV",
"date": "2020-12-20"
}
}
]
And the second:
[
{
"adGroup": {
"resourceName": "customers/5404177717/adGroups/110501283582",
"campaign": "customers/5404177717/campaigns/11628802542"
},
"metrics": {
"interactionEventTypes": [
"CLICK"
],
"clicks": "1",
"averageCpm": 95497428.02172929,
"gmailForwards": "0",
"gmailSaves": "0",
"gmailSecondaryClicks": "0",
"impressions": "4418",
"interactionRate": 0.00022634676324128565,
"interactions": "1"
},
"adGroupAd": {
"resourceName": "customers/5404177717/adGroupAds/110501283582~480227690139",
"status": "ENABLED",
"ad": {
"resourceName": "customers/5404177717/ads/480227690139",
"id": "480227690139",
"name": "20 sec perek"
},
"adGroup": "customers/5404177717/adGroups/110501283582"
},
"segments": {
"device": "DESKTOP",
"date": "2020-11-21"
}
}
]
I already have two tables in my database to save this data. I have an attribute table.name so that I don't have to create identical blocks where only the table name differs.
My next processor is FlattenJson. After it I'm using ReplaceText with this search value (the replacement value is an empty string): (customers\\\/${client.customer.id}\\\/campaigns\\\/|customers\\\/${client.customer.id}\\\/adGroups\\\/).
Why? From the line "adGroup": "customers/5404177717/adGroups/110501283582" I only need the last value, 110501283582, as ad_group_id. And from the line "campaign": "customers/5404177717/campaigns/11628802542" I only need 11628802542. ${client.customer.id} can differ, so I'm using Expression Language.
I also need to rename the JSON key adGroup to ad.group.id, and for this I'm also using ReplaceText.
Can I do this faster, without two ReplaceText processors?
Look at the following processors; I think using them could be an alternative:
JoltTransformJSON:
https://nifi.apache.org/docs/nifi-docs/components/org.apache.nifi/nifi-standard-nar/1.5.0/org.apache.nifi.processors.standard.JoltTransformJSON/
UpdateRecord:
https://nifi.apache.org/docs/nifi-docs/components/org.apache.nifi/nifi-standard-nar/1.5.0/org.apache.nifi.processors.standard.UpdateRecord/index.html
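To make the JoltTransformJSON suggestion more concrete, here is a rough, untested sketch of a spec that replaces both ReplaceText steps for one field. It assumes a flat record with an adGroup field (adapt the name to whatever FlattenJson produces); adGroupParts is a temporary helper field, and the result is written to ad_group_id because a literal ad.group.id key would need dot-escaping in the spec. The same split/lastElement pattern applies to the campaign field.
[
  {
    "operation": "modify-overwrite-beta",
    "spec": {
      "adGroupParts": "=split('/', @(1,adGroup))",
      "ad_group_id": "=lastElement(@(1,adGroupParts))"
    }
  },
  {
    "operation": "remove",
    "spec": {
      "adGroup": "",
      "adGroupParts": ""
    }
  }
]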

How to search key by passing value in json_query Ansible

I am calling an API and getting the output below. From this output I want to find the key based on a value input; my input value is "vpc-tz". How can I achieve this in Ansible using json_query?
{
"json": {
"allScopes": [
{
"
"clusters": {
"clusters": [
{
"cluster": {
"clientHandle": "",
"type": {
"name": "ClusterComputeResource"
},
"universalRevision": 0,
"vsmUuid": "423B1819-9495-4F10-A96A-6D8284E51B29"
}
}
]
},
"controlPlaneMode": "UNICAST_MODE",
"description": "",
"extendedAttributes": [
],
"id": "vdnscope-6",
"isTemporal": false,
"isUniversal": false,
"name": "vpc-tz",
"nodeId": "85e0073d-0e5a-4f04-889b-42df771aebf8",
"objectId": "vdnscope-6",
"objectTypeName": "VdnScope",
"revision": 0,
"type": {
"name": "VdnScope"
},
"universalRevision": 0,
"virtualWireCount": 0,
"vsmUuid": "423B1819-9495-4F10-A96A-6D8284E51B29"
},
]
}
}
Here is a query which works:
json.allScopes[?name=='vpc-tz'].name
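As an illustration of how that query can be used in a playbook (a sketch: nsx_scopes is a hypothetical registered variable holding the API response, returning objectId instead of name is an assumption about what you actually need, and the json_query filter requires the jmespath Python library):
- name: Find the scope objectId for a given name
  set_fact:
    # json_query returns a list of matches; first picks the single hit
    scope_id: "{{ nsx_scopes.json.allScopes | json_query(scope_query) | first }}"
  vars:
    scope_name: "vpc-tz"
    scope_query: "[?name=='{{ scope_name }}'].objectId"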

NLog: LayoutRenderer cannot be found: 'aspnet-user-identity

I'm trying to implement NLog in my .NET Core API web service.
I want to log to an Oracle database. Everything works well through an nlog.config XML file.
But the goal is to put the NLog configuration into appsettings.json, and here the problem occurs.
I get the error given in the title:
LayoutRenderer cannot be found: 'aspnet-user-identity
My config file looks like this:
"NLog": {
"autoReload": true,
"throwConfigExceptions": true,
"internalLogLevel": "info",
"internalLogFile": "c:/app/log/dev/internal-appsetting-nlog.txt",
"extensions": {
"NLog.Extensions.Logging": {
"assembly": [
"NLog.Extensions.Logging",
"NLog.Web.AspNetCore"
]
}
},
"variables": {
"var_logdir": "c:/app/log/dev"
},
"default-wrapper": {
"type": "AsyncWrapper",
"overflowAction": "Block"
},
"targets": {
"all-file": {
"type": "File",
"fileName": "${var_logdir}/nlog-all-${shortdate}.log",
"layout": {
"type": "JsonLayout",
"Attributes": [
{
"name": "timestamp",
"layout": "${date:format=o}"
},
{
"name": "level",
"layout": "${level}"
},
{
"name": "logger",
"layout": "${logger}"
},
{
"name": "message",
"layout": "${message:raw=true}"
},
{
"name": "properties",
"encode": false,
"layout": {
"type": "JsonLayout",
"includeallproperties": "true"
}
}
]
}
},
"db": {
"type": "Database",
"commandText": "INSERT INTO logtable (LOGLEVEL,LOGGER,MESSAGE,MACHINENAME,USERNAME,CALLSITE, THREADID,EXCEPTIONMESSAGE,STACKTRACE,SESSIONID) VALUES (:pLEVEL,:pLOGGER,:pMESSAGE,:pMACHINENAME, :pCALLSITE,:pTHREADID,:pEXCEPTIONMESSAGE,:pSTACKTRACE)",
"parameters": [
{
"name": "#pLEVEL",
"layout": "${level}"
},
{
"name": "#pLOGGER",
"layout": "${logger}"
},
{
"name": "#pMESSAGE",
"layout": "${message}"
},
{
"name": "#pMACHINENAME",
"layout": "${machinename}"
},
{
"name": "#pUSERNAME",
"layout": "${aspnet-user-identity}"
},
{
"name": "#pCALLSITE",
"layout": "${callsite:filename=true}"
},
{
"name": "#pTHREADID",
"layout": "${threadid}"
},
{
"name": "#pEXCEPTIONMESSAGE",
"layout": "${exception}"
},
{
"name": "#pSTACKTRACE",
"layout": "${stacktrace}"
},
{
"name": "#pSESSIONID",
"layout": "${aspnet-sessionid}"
}
],
"dbProvider": "Oracle.ManagedDataAccess.Client.OracleConnection, Oracle.ManagedDataAccess",
"connectionString": "xxxxxxxxxxxx"
}
},
"rules": [
{
"logger": "*",
"minLevel": "Trace",
"writeTo": "all-file"
},
{
"logger": "*",
"minLevel": "Trace",
"writeTo": "db"
},
{
"logger": "Microsoft.*",
"maxLevel": "Info",
"final": true
}
]
},
The NLog internal log reports:
2019-10-09 16:48:48.6665 Info Adding target AsyncTargetWrapper(Name=all-file)
2019-10-09 16:48:48.7859 Warn Error when setting property 'Layout' on 'NLog.Targets.DatabaseParameterInfo' Exception: System.ArgumentException: LayoutRenderer cannot be found: 'aspnet-user-identity'. Is NLog.Web not included?
at NLog.Config.Factory`2.CreateInstance(String itemName)
at NLog.Layouts.LayoutParser.GetLayoutRenderer(ConfigurationItemFactory configurationItemFactory, String name)
at NLog.Layouts.LayoutParser.ParseLayoutRenderer(ConfigurationItemFactory configurationItemFactory, SimpleStringReader stringReader)
at NLog.Layouts.LayoutParser.CompileLayout(ConfigurationItemFactory configurationItemFactory, SimpleStringReader sr, Boolean isNested, String& text)
at NLog.Layouts.SimpleLayout.set_Text(String value)
at NLog.Internal.PropertyHelper.TryNLogSpecificConversion(Type propertyType, String value, Object& newValue, ConfigurationItemFactory configurationItemFactory)
at NLog.Internal.PropertyHelper.SetPropertyFromString(Object obj, String propertyName, String value, ConfigurationItemFactory configurationItemFactory)
The error also occurs on ${aspnet-sessionid}. If I comment out both layouts, everything works well.
I found various suggestions in GitHub issue reports, but everything I tried failed.
Could someone help?
The unknown aspnet-user-identity is probably caused by how your "extensions" section is declared; try it as an array:
"extensions": [
{ "assembly": "NLog.Extensions.Logging" },
{ "assembly": "NLog.Web.AspNetCore" }
],
Could you try the above suggestion?
P.S. I have updated the wiki to include an example with multiple "extensions".
