Syncing DynamoDB with ElasticSearch for old Data

I'm using this function https://github.com/bfansports/dynamodb-to-elasticsearch to sync my DynamoDB table with ElasticSearch. Unfortunately it only processes newly added and updated data, not the rows that already existed in the table, even though I chose "New and old images - both the new and the old images of the item" in the Manage stream section.
How can I fix that?

OK, I ended up updating every item in the DynamoDB table; that triggers the stream, so the sync between ElasticSearch and DynamoDB also happens for the existing rows.
This is the script that I use:
import json
import boto3
import random

def lambda_handler(event, context):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('image-library')
    # Fetch only the partition key of every item
    response = table.scan(
        ProjectionExpression='#k',
        ExpressionAttributeNames={
            '#k': 'id',  # partition key
        }
    )
    items = response['Items']
    random_number = random.randint(0, 1000)
    # Touch every item so the DynamoDB stream emits a record for it
    for item in items:
        response = table.update_item(
            Key=item,
            UpdateExpression='SET #f = :f',
            ExpressionAttributeNames={
                '#f': 'force_update'
            },
            ExpressionAttributeValues={
                ':f': random_number
            }
        )
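Note that a single table.scan call returns at most 1 MB of data, so on a larger table the loop above would miss items. A minimal sketch of following LastEvaluatedKey (same table and key names as above):
import boto3

# Sketch: scan the whole 'image-library' table, following pagination
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('image-library')
items = []
scan_kwargs = {
    'ProjectionExpression': '#k',
    'ExpressionAttributeNames': {'#k': 'id'},
}
while True:
    response = table.scan(**scan_kwargs)
    items.extend(response['Items'])
    if 'LastEvaluatedKey' not in response:
        break
    scan_kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']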

Related

Drop Mime data (csv file) into QTableView with custom QSqlTableModel is not working

I'm using a QTableView along with a subclassed QSqlTableModel to display a SQLite table in Qt and to insert new records by dropping a CSV file.
I followed the docs and came up with the example reported below. It is a very light reproducible example that shows what is happening in my code; no sanity checks or quality code are intentionally used. It's tested against PySide6.
import sys
from qtpy.QtWidgets import QApplication, QTableView, QWidget
from qtpy.QtCore import QModelIndex, QMimeData, Qt
from qtpy.QtSql import QSqlDatabase, QSqlTableModel, QSqlQuery
from pandas import read_csv

def create_table():
    # Dummy very simple table
    _query_str = """CREATE TABLE MyTable (
        ID INTEGER PRIMARY KEY AUTOINCREMENT,
        Field1 INTEGER,
        Field2 TEXT);"""
    query = QSqlQuery(db=db, query=_query_str)
    query.exec_()

class MyTableModel(QSqlTableModel):
    def __init__(self, table_name, db):
        QSqlTableModel.__init__(self, db=db)
        self.setTable(table_name)

    def canDropMimeData(self, data: QMimeData, action: Qt.DropAction, row: int, column: int, parent: QModelIndex) -> bool:
        return True  # <-- Just for the example

    def supportedDropActions(self) -> Qt.DropAction:
        return Qt.DropAction.CopyAction | Qt.DropAction.MoveAction | Qt.DropAction.LinkAction

    def dropMimeData(self, data: QMimeData, action: Qt.DropAction, row: int, column: int, parent: QModelIndex) -> bool:
        csv_filename = data.urls()[0].toLocalFile()
        df = read_csv(csv_filename, delimiter=',', header=0)
        for _, row in df.iterrows():
            record = self.record()
            record.remove(0)  # <-- Remove the ID field
            record.setValue('Field1', row['Field1'].values[0])
            record.setValue('Field2', row['Field2'].values[0])
            self.insertRecord(-1, record)

if __name__ == '__main__':
    # In-memory database just for the purpose of the example
    db = QSqlDatabase.addDatabase("QSQLITE", ":memory:")
    if not db.open():
        raise RuntimeError("Database not opened")
    create_table()
    app = QApplication([])
    table = QTableView()
    model = MyTableModel('MyTable', db)
    table.setModel(model)
    table.setAcceptDrops(True)
    table.show()
    sys.exit(app.exec_())
What I get is that canDropMimeData and supportedDropActions are correctly called, but (checking with the debugger) dropMimeData is never called.
A screenshot (not reproduced here) also showed that, even though canDropMimeData returns True, the file does not seem to be accepted.
Edit 1 - QSqlTableModel issue
I found out that the problem is with QSqlTableModel. If I use a bare QStandardItemModel, everything works fine. Any work-around?
By default, item models don't provide drag and drop support.
In order to properly allow it, many aspects have to be checked, including that the flags() returned by any index that should accept a drop must include Qt.ItemIsDropEnabled.
If you want to allow that only for the model (drop on an empty area, not on items), that index would be the root index, aka, an invalid index:
def flags(self, index):
    flags = super().flags(index)
    if not index.isValid():
        flags |= Qt.ItemIsDropEnabled
    return flags
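A minimal sketch of how that override could sit inside the MyTableModel subclass from the question (only flags() is new; the other reimplementations stay as posted):
from qtpy.QtCore import Qt
from qtpy.QtSql import QSqlTableModel

class MyTableModel(QSqlTableModel):
    def __init__(self, table_name, db):
        super().__init__(db=db)
        self.setTable(table_name)

    def flags(self, index):
        flags = super().flags(index)
        # Allow drops on the empty area of the view (the root, i.e. invalid, index)
        if not index.isValid():
            flags |= Qt.ItemIsDropEnabled
        return flags

    # canDropMimeData, supportedDropActions and dropMimeData stay as in the question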

AWS IOT button toggle for IFTTT

I have an AWS IOT button set up and working with IFTTT and SmartLife to turn a device on/off. Currently I have it set up to use single and double click to turn on and off, because IFTTT doesn't seem to have a toggle app (at least, not for use with SmartLife.)
How can I make it a toggle, so I can use a single click to alternately turn on and off?
Looking for a free solution.
There is a solution using Apilio, but it's not a free solution: Create a toggle between two actions in IFTTT.
For a free solution, use DynamoDB from Lambda to save the button state, and invert the state on each invocation. It sends either "IotButton2" or "IotButton2Off" to IFTTT.
'''
Example Lambda IOT button IFTTT toggle
Test payload:
{
    "serialNumber": "GXXXXXXXXXXXXXXXXX",
    "batteryVoltage": "990mV",
    "clickType": "SINGLE" # or "DOUBLE" or "LONG"
}
'''
from __future__ import print_function

import json
import logging
import urllib2

import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger()
logger.setLevel(logging.INFO)

maker_key = 'xxxxxxxxxxxxxxxxx'  # change this to your Maker key

def get_button_state(db, name):
    table = db.Table('toggles')
    try:
        response = table.get_item(Key={'name': name})
    except ClientError as e:
        print(e.response['Error']['Message'])
    else:
        # response['Item'] == {u'name': u'IotButton2', u'on': False}
        if 'Item' in response:
            return response['Item']['on']
    return False

def set_button_state(db, name, state):
    table = db.Table('toggles')
    try:
        response = table.put_item(Item={'name': name, 'on': state})
    except ClientError as e:
        print(e.response['Error']['Message'])

def lambda_handler(event, context):
    logger.info('Received event: ' + json.dumps(event))
    db = boto3.resource('dynamodb')
    maker_event = "IotButton2"
    # maker_event += ":" + event["clickType"]
    state = get_button_state(db, maker_event)
    logger.info(maker_event + " state = " + ("on" if state else "off"))
    response = set_button_state(db, maker_event, not state)
    if state:
        maker_event += "Off"
    logger.info('Maker event: ' + maker_event)
    url = 'https://maker.ifttt.com/trigger/%s/with/key/%s' % (maker_event, maker_key)
    f = urllib2.urlopen(url)
    response = f.read()
    f.close()
    logger.info('"' + maker_event + '" event has been sent to IFTTT Maker channel')
    return response
The above version responds to any type of click (single, double, long.) You can control 3 different switches by uncommenting this line:
maker_event += ":" + event["clickType"]
which would translate to these IFTTT events:
IotButton2:SINGLE
IotButton2:SINGLEOff
IotButton2:DOUBLE
IotButton2:DOUBLEOff
IotButton2:LONG
IotButton2:LONGOff
Create the DynamoDB table. For my example, the table name is "toggles" with one key field "name" and one boolean field "on". The table has to exist, but if the entry does not, it gets created the first time you click the button or test the Lambda function.
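If you would rather create the table from code than from the console, a minimal boto3 sketch could look like this (same table and key names as above; the region follows the ARN in the policy below, and on-demand billing is just an assumption):
import boto3

# One-time setup for the "toggles" table used by the Lambda above
dynamodb = boto3.client('dynamodb', region_name='us-east-1')
dynamodb.create_table(
    TableName='toggles',
    AttributeDefinitions=[{'AttributeName': 'name', 'AttributeType': 'S'}],
    KeySchema=[{'AttributeName': 'name', 'KeyType': 'HASH'}],
    BillingMode='PAY_PER_REQUEST',  # assumption: on-demand capacity
)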
You have to update the Lambda function role to include your DynamoDB permissions. Add the following lines to the policy:
{
    "Effect": "Allow",
    "Action": [
        "dynamodb:GetItem",
        "dynamodb:PutItem"
    ],
    "Resource": [
        "arn:aws:dynamodb:us-east-1:xxxxxxxx:table/toggles"
    ]
}
(Get the ARN from AWS console DynamoDB -> table -> toggles -> Additional information.)
You can also edit the above function to handle multiple buttons, by checking the serial number.

ResourceNotFoundException while adding data to Kinesis Firehose stream using Lambda

I am trying to add data to a Kinesis Firehose delivery stream using put_record with Python 3.6 on AWS Lambda. When calling put_record on the stream I get the following exception.
An error occurred (ResourceNotFoundException) when calling the PutRecord operation: Stream MyStream under account 123456 not found.
I am executing the following Python code to add data to the stream.
import boto3
import json

def lambda_handler(event, context):
    session = boto3.Session(aws_access_key_id=key_id, aws_secret_access_key=access_key)
    kinesis_client = session.client('kinesis', region_name='ap-south-1')
    records = event['Records']
    write_records = list()
    count = 0
    for record in records:
        count += 1
        if str(record['eventName']).lower() == 'insert':
            rec = record['dynamodb']['Keys']
            rec.update(record['dynamodb']['NewImage'])
            new_record = dict()
            new_record['Data'] = json.dumps(rec).encode()
            new_record['PartitionKey'] = 'PartitionKey' + str(count)
            # Following line throws the exception
            kinesis_client.put_record(StreamName="MyStream", Data=new_record['Data'], PartitionKey='PartitionKey' + str(count))
        elif str(record['eventName']).lower() == 'modify':
            pass
    write_records = json.dumps(write_records)
    print(stream_data)
MyStream's status is active and the source for the stream data is set to "Direct PUT and other sources".
If you are sure that the stream name is correct, you can create the client with the regional endpoint of Kinesis:
kinesis_client = session.client('kinesis', region_name='ap-south-1', endpoint_url='https://kinesis.ap-south-1.amazonaws.com/')
AWS Service Endpoints List
https://docs.aws.amazon.com/general/latest/gr/rande.html
Hope this helps!
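As an aside, the question talks about a Firehose delivery stream while the code uses the 'kinesis' client; the two services have separate APIs, so a ResourceNotFoundException can also mean the wrong client is being used. A minimal sketch of both calls (stream name and region taken from the question; an illustration, not a confirmed diagnosis):
import json
import boto3

session = boto3.Session()  # in Lambda, credentials come from the execution role
payload = json.dumps({'example': 'data'}).encode()

# Kinesis Data Stream: StreamName + PartitionKey
kinesis_client = session.client('kinesis', region_name='ap-south-1')
kinesis_client.put_record(StreamName='MyStream', Data=payload, PartitionKey='PartitionKey1')

# Kinesis Firehose delivery stream: a different client and parameter names
firehose_client = session.client('firehose', region_name='ap-south-1')
firehose_client.put_record(DeliveryStreamName='MyStream', Record={'Data': payload})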

Cloudwatch logs filter to trigger lambda then extract values from log data

I have a question, as in the subject line.
I want to create an AWS CloudWatch Logs filter or event that triggers a Lambda function on a filter pattern, and then extract values from that log data inside the Lambda function, in Python.
Example:
Filter name: abcd
Value to extract: 01234 (passed to the Lambda function)
Log data:
abcd:01234
Any ideas?
Here is a simple way to capture the events from CloudWatch Logs. The log data is in the message. You could process it here or send it on to Firehose and transform it there. Alternatively, you could send CloudWatch Logs directly to Firehose with a subscription, but I think that has to be done with the AWS CLI.
import base64
import gzip
import json

import boto3

firehose = boto3.client('firehose', region_name='us-east-2')

def print_result(firehose_return):
    records_error = int(firehose_return['FailedPutCount'])
    records_sent = len(firehose_return['RequestResponses'])
    return 'Firehose sent %d records, %d error(s)' % (records_sent, records_error)

def lambda_handler(events, context):
    # CloudWatch Logs delivers the payload base64-encoded and gzip-compressed
    cw_encoded_logs_data = events['awslogs']['data']
    compressed_payload = base64.b64decode(cw_encoded_logs_data)
    cw_decoded_logs_data = gzip.decompress(compressed_payload)
    cw_all_events = json.loads(cw_decoded_logs_data)
    records = []
    for event in cw_all_events['logEvents']:
        log_event = {
            "Data": str(event['message']) + '\n'
        }
        records.append(log_event)
        # Firehose PutRecordBatch accepts at most 500 records per call
        if len(records) > 499:
            firehose_return = firehose.put_record_batch(
                DeliveryStreamName='streamname',
                Records=records
            )
            print(print_result(firehose_return))
            records = []
    if len(records) > 0:
        firehose_return = firehose.put_record_batch(
            DeliveryStreamName='streamname',
            Records=records
        )
        print(print_result(firehose_return))
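The value extraction asked about in the question isn't shown above; here is a small sketch that could be applied to each message before forwarding it (the abcd:01234 pattern is an assumption taken from the question's example):
import re

# Hypothetical pattern based on the question's "abcd:01234" example
FILTER_PATTERN = re.compile(r'abcd:(\d+)')

def extract_value(message):
    # Returns '01234' for a message containing "abcd:01234", else None
    match = FILTER_PATTERN.search(message)
    return match.group(1) if match else None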

Streaming to HBase with pyspark

There is a fair amount of info online about bulk loading to HBase with Spark streaming using Scala (these two were particularly useful) and some info for Java, but there seems to be a lack of info for doing it with PySpark. So my questions are:
How can data be bulk loaded into HBase using PySpark?
Most examples in any language only show a single column per row being upserted. How can I upsert multiple columns per row?
The code I currently have is as follows:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    context = SparkContext(appName="PythonHBaseBulkLoader")
    streamingContext = StreamingContext(context, 5)

    stream = streamingContext.textFileStream("file:///test/input")
    stream.foreachRDD(bulk_load)

    streamingContext.start()
    streamingContext.awaitTermination()
What I need help with is the bulk load function
def bulk_load(rdd):
    #???
I've made some progress previously, with many and various errors (as documented here and here)
So after much trial and error, I present here the best I have come up with. It works well, and successfully bulk loads data (using Puts or HFiles). I am perfectly willing to believe that it is not the best method, so any comments/other answers are welcome. This assumes you're using a CSV for your data.
Bulk loading with Puts
By far the easiest way to bulk load, this simply creates a Put request for each cell in the CSV and queues them up to HBase.
def bulk_load(rdd):
    # Your configuration will likely be different. Insert your own quorum and parent node and table name
    conf = {"hbase.zookeeper.quorum": "localhost:2181",
            "zookeeper.znode.parent": "/hbase-unsecure",
            "hbase.mapred.outputtable": "Test",
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    # Split the input into individual lines, then convert each CSV line to key-value pairs
    load_rdd = rdd.flatMap(lambda line: line.split("\n"))\
                  .flatMap(csv_to_key_value)
    load_rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)
The function csv_to_key_value is where the magic happens:
def csv_to_key_value(row):
    cols = row.split(",")  # Split on commas
    # Each cell is a tuple of (key, [key, column-family, column-descriptor, value])
    # Works well for n>=1 columns
    result = ((cols[0], [cols[0], "f1", "c1", cols[1]]),
              (cols[0], [cols[0], "f2", "c2", cols[2]]),
              (cols[0], [cols[0], "f3", "c3", cols[3]]))
    return result
The value converter we defined earlier will convert these tuples into HBase Puts.
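As a concrete illustration (hypothetical input), a line like "r1,10,20,30" maps to three tuples that all share the row key r1:
>>> csv_to_key_value("r1,10,20,30")
(('r1', ['r1', 'f1', 'c1', '10']),
 ('r1', ['r1', 'f2', 'c2', '20']),
 ('r1', ['r1', 'f3', 'c3', '30']))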
Bulk loading with HFiles
Bulk loading with HFiles is more efficient: rather than a Put request for each cell, an HFile is written directly and the RegionServer is simply told to point to the new HFile. This will use Py4J, so before the Python code we have to write a small Java program:
import py4j.GatewayServer;
import org.apache.hadoop.hbase.*;

public class GatewayApplication {

    public static void main(String[] args)
    {
        GatewayApplication app = new GatewayApplication();
        GatewayServer server = new GatewayServer(app);
        server.start();
    }
}
Compile this, and run it. Leave it running as long as your streaming is happening. Now update bulk_load as follows:
from py4j.java_gateway import JavaGateway  # needed for the Py4J calls below

def bulk_load(rdd):
    # The output class changes, everything else stays
    conf = {"hbase.zookeeper.quorum": "localhost:2181",
            "zookeeper.znode.parent": "/hbase-unsecure",
            "hbase.mapred.outputtable": "Test",
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}  # "org.apache.hadoop.hbase.client.Put"
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    load_rdd = rdd.flatMap(lambda line: line.split("\n"))\
                  .flatMap(csv_to_key_value)\
                  .sortByKey(True)
    # Don't process empty RDDs
    if not load_rdd.isEmpty():
        # saveAsNewAPIHadoopDataset changes to saveAsNewAPIHadoopFile
        # (startTime is assumed to be a timestamp string defined elsewhere)
        load_rdd.saveAsNewAPIHadoopFile("file:///tmp/hfiles" + startTime,
                                        "org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2",
                                        conf=conf,
                                        keyConverter=keyConv,
                                        valueConverter=valueConv)
        # The file has now been written, but HBase doesn't know about it
        # Get a link to Py4J
        gateway = JavaGateway()
        # Convert conf to a fully fledged Configuration type
        config = dict_to_conf(conf)
        # Set up our HTable
        htable = gateway.jvm.org.apache.hadoop.hbase.client.HTable(config, "Test")
        # Set up our path
        path = gateway.jvm.org.apache.hadoop.fs.Path("/tmp/hfiles" + startTime)
        # Get a bulk loader
        loader = gateway.jvm.org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles(config)
        # Load the HFile
        loader.doBulkLoad(path, htable)
    else:
        print("Nothing to process")
Finally, the fairly straightforward dict_to_conf:
def dict_to_conf(conf):
    gateway = JavaGateway()
    config = gateway.jvm.org.apache.hadoop.conf.Configuration()
    for key, value in conf.items():
        config.set(key, value)
    return config
As you can see, bulk loading with HFiles is more complex than using Puts, but depending on your data load it is probably worth it since once you get it working it's not that difficult.
One last note on something that caught me off guard: HFiles expect the data they receive to be written in lexical order. This is not always guaranteed to be true, especially since "10" < "9". If you have designed your key to be unique, then this can be fixed easily:
load_rdd = rdd.flatMap(lambda line: line.split("\n"))\
              .flatMap(csv_to_key_value)\
              .sortByKey(True)  # Sort in ascending order
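If the row keys are numeric, one simple way to make lexical order agree with numeric order is to zero-pad them when building the key. This helper is an assumption of mine, not part of the original answer:
# Hypothetical helper: zero-pad numeric row keys so that lexical order
# matches numeric order ("9" -> "0000000009", which sorts before "0000000010")
def make_row_key(raw_key, width=10):
    return str(raw_key).zfill(width)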
