AWS Glue cannot read a case-sensitive table from Oracle

I am trying to bring data from an Oracle table that is case sensitive into AWS S3 using AWS Glue. The Oracle query looks something like this:
Select *
from myschema."Employee_Salary"
The AWS Glue crawler is able to pull the table metadata, but the Glue job is not pulling the data; it fails with the error "no table or view exists".
My sample code looks like this:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "orctest4", table_name = "orcl_orcl_Employee_Salary", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("empid", "decimal(10,2)", "empid", "decimal(10,2)"), ("addressdescription", "string", "addressdescription", "string"), ("salary_id", "decimal(10,2)", "salary_id", "decimal(10,2)"), ("salary_amount", "decimal(10,2)", "salary_amount", "decimal(10,2)"), ("manager_id", "decimal(10,0)", "manager_id", "decimal(10,0)")], transformation_ctx = "applymapping1")
resolvechoice2 = ResolveChoice.apply(frame = applymapping1, choice = "make_struct", transformation_ctx = "resolvechoice2")
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
datasink4 = glueContext.write_dynamic_frame.from_options(frame = dropnullfields3, connection_type = "s3", connection_options = {"path": "s3://test/my_orcle_parquet"}, format = "parquet", transformation_ctx = "datasink4")
job.commit()
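One workaround worth trying, if the catalog-backed read keeps failing on the quoted identifier, is to read the table directly over JDBC with the identifier quoted exactly as Oracle stores it. This is a minimal sketch only: the JDBC URL, user, and password below are placeholders for your own connection details.

# Hypothetical connection details; substitute the values from your Glue connection.
jdbc_url = "jdbc:oracle:thin:@//myhost:1521/ORCL"

df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("user", "myuser") \
    .option("password", "mypassword") \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("dbtable", 'myschema."Employee_Salary"') \
    .load()

# Wrap the DataFrame back into a DynamicFrame so the rest of the job can stay unchanged.
from awsglue.dynamicframe import DynamicFrame
datasource0 = DynamicFrame.fromDF(df, glueContext, "datasource0")

Because the quoted table name is passed straight through to Oracle, the case-sensitive identifier is preserved instead of being lower-cased by the catalog lookup.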

Related

How to create an Elasticsearch user using Flask

I want to create a user / role / privilege in Elasticsearch using its API from Flask.
The documentation for creating a user provides an example, and it works fine in the Elastic Dev Tools console, but how can I convert it into a Python POST request?
My code:
from flask import Flask, request, jsonify, render_template
from elasticsearch import Elasticsearch

CLOUD_ID = "myfirstdeployment:XXX"
ELASTIC_PASS = 'XXX'
ELASTIC_USER = 'XXX'

client = Elasticsearch(cloud_id=CLOUD_ID, basic_auth=(ELASTIC_USER, ELASTIC_PASS))

app = Flask(__name__)

import requests
from requests.structures import CaseInsensitiveDict

@app.route('/get')
def getting():
    data = client.search(index="kibana_sample_data_ecommerce", body={"query": {"match_all": {}}})
    return f'{[x["_source"]["category"] for x in data["hits"]["hits"]]}'

es = Elasticsearch(hosts="https://localhost:9200", basic_auth=('elastic', 'zoU_Ec8JjbPnQNG4b8kY'), verify_certs=False)

@app.route('/local')
def local():
    return f'{es.info()}'

@app.route('/users')
def getAllUser():
    uri = 'https://localhost:9200/_security/user/'
    es = Elasticsearch(hosts=uri, basic_auth=('elastic', 'zoU_Ec8JjbPnQNG4b8kY'), ca_certs="872ee6c0879fc0cfe73054c3ba7afb5902dbb171a2c215af35a5faab1206b924", verify_certs=False)
    return f'{es.info()}'

@app.route('/users/<name>')
def getSingleUser(name):
    try:
        uri = f'https://localhost:9200/_security/user/{name}'
        es = Elasticsearch(hosts=uri, basic_auth=('elastic', 'zoU_Ec8JjbPnQNG4b8kY'), ca_certs="872ee6c0879fc0cfe73054c3ba7afb5902dbb171a2c215af35a5faab1206b924", verify_certs=False)
        return f'{es.info()}'
    except:
        content = {'error': 'User Not Found'}
        return content, 404

@app.route('/create-new-user', methods=['GET', 'POST'])
def createUser():
    if request.method == 'POST':
        username = request.form.get('username')
        password = request.form.get('password')
        email = request.form.get('email')
        fullname = request.form.get('fullname')
        role = request.form.getlist('role')
        body = {"password": password, "username": username, "email": email, "fullname": fullname, "role": role}
        try:
            uri = f'https://localhost:9200/_security/user/{username}'
            es = Elasticsearch(hosts=uri, basic_auth=('elastic', 'zoU_Ec8JjbPnQNG4b8kY'), ca_certs="872ee6c0879fc0cfe73054c3ba7afb5902dbb171a2c215af35a5faab1206b924", verify_certs=False)
            return f'{es.info()}'
        except:
            content = {'error': 'something went wrong'}
            return content, 501
    return render_template('add_user.html')

if __name__ == "__main__":
    app.run(debug=True)
When I create a user from Stack Management > Security > Users > Create, a POST request is sent to _security/user/new_user_username with
post data = {password=password, username=username, email=email, role=[], fullname=fullname}
First, thanks to Paulo.
Using the put_user() method we can easily create a user.
The username, password, and email fields are mandatory when creating a user through the API.
from elasticsearch.client import SecurityClient

@app.route('/create-new-user', methods=['GET', 'POST'])
def createUser():
    if request.method == 'POST':
        username = request.form.get('username')
        password = request.form.get('password')
        email = request.form.get('email')
        fullname = request.form.get('fullname')
        roles = request.form.getlist('role')
        body = {"password": password, "username": username, "email": email, "fullname": fullname, "roles": roles}
        try:
            client = Elasticsearch(hosts='https://localhost:9200/', basic_auth=(ELASTIC_USERNAME, ELASTIC_PASSWORD), ca_certs=CERTIFICATE, verify_certs=False)
            es = SecurityClient(client)
            es.put_user(**body)
            return {'message': 'User created'}, 201
        except:
            return {'message': 'something went wrong'}, 501
    return render_template('add_user.html')
Remember to pass roles as a keyword argument to put_user().
Edit: if you are experimenting, you can also try perform_request.
Edit 2: a simpler and better solution:
body = {"password": password, "username": username, "email": email, "full_name": fullname, "enabled": True, "roles": roles}
uri = f'https://localhost:9200/'
client = Elasticsearch(hosts=uri, basic_auth=(ELASTIC_USER, ELASTIC_PASS), ca_certs=CERTIFICATE, verify_certs=False)
client.perform_request(body=body, method='POST', path=f'/_security/user/{username}', headers={'content-type': 'application/json', 'accept': 'application/json'})
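For completeness, since the question asked how to turn the Dev Tools call into a plain Python POST request: here is a minimal sketch using the requests library (which the question's code already imports). It assumes the same local cluster, credentials, and form variables defined above; nothing here is an official client API, just the raw _security/user endpoint.

import requests

# Assumes username, password, email, fullname, roles, ELASTIC_USER and
# ELASTIC_PASS hold the same values used in the code above.
resp = requests.post(
    f"https://localhost:9200/_security/user/{username}",
    json={"password": password, "email": email, "full_name": fullname, "roles": roles},
    auth=(ELASTIC_USER, ELASTIC_PASS),
    verify=False,  # or point verify at your CA certificate file
)
print(resp.status_code, resp.json())

On the first successful call the API should report the user as created; calling it again for the same username updates the user instead.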

Flink is not adding any data to Elasticsearch but no errors

Folks, I'm new to this whole data-streaming process, but I was able to build and submit a Flink job that reads some CSV data from Kafka, aggregates it, and then puts it into Elasticsearch.
I was able to do the first two parts and print my aggregation to STDOUT, but when I added the code to write to Elasticsearch, nothing seems to happen there (no data is being added). I looked at the Flink job manager log and it looks fine (no errors) and says:
2020-03-03 16:18:03,877 INFO
org.apache.flink.streaming.connectors.elasticsearch7.Elasticsearch7ApiCallBridge
- Created Elasticsearch RestHighLevelClient connected to [http://elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local:9200]
Here is my code at this point:
/*
* This Scala source file was generated by the Gradle 'init' task.
*/
package flinkNamePull
import java.time.LocalDateTime
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer010, FlinkKafkaProducer010}
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{DataTypes, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.descriptors.{Elasticsearch, Json, Schema}
object Demo {
/**
* MapFunction to generate Transfers POJOs from parsed CSV data.
*/
class TransfersMapper extends RichMapFunction[String, Transfers] {
private var formatter = null
@throws[Exception]
override def open(parameters: Configuration): Unit = {
super.open(parameters)
//formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
}
@throws[Exception]
override def map(csvLine: String): Transfers = {
//var splitCsv = csvLine.stripLineEnd.split("\n")(1).split(",")
var splitCsv = csvLine.stripLineEnd.split(",")
val arrLength = splitCsv.length
val i = 0
if (arrLength != 13) {
for (i <- arrLength + 1 to 13) {
if (i == 13) {
splitCsv = splitCsv :+ "0.0"
} else {
splitCsv = splitCsv :+ ""
}
}
}
var trans = new Transfers()
trans.rowId = splitCsv(0)
trans.subjectId = splitCsv(1)
trans.hadmId = splitCsv(2)
trans.icuStayId = splitCsv(3)
trans.dbSource = splitCsv(4)
trans.eventType = splitCsv(5)
trans.prev_careUnit = splitCsv(6)
trans.curr_careUnit = splitCsv(7)
trans.prev_wardId = splitCsv(8)
trans.curr_wardId = splitCsv(9)
trans.inTime = splitCsv(10)
trans.outTime = splitCsv(11)
trans.los = splitCsv(12).toDouble
return trans
}
}
def main(args: Array[String]) {
// Create streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
// Set properties per KafkaConsumer API
val properties = new Properties()
properties.setProperty("bootstrap.servers", "kafka.kafka:9092")
properties.setProperty("group.id", "test")
// Add Kafka source to environment
val myKConsumer = new FlinkKafkaConsumer010[String]("raw.data3", new SimpleStringSchema(), properties)
// Read from beginning of topic
myKConsumer.setStartFromEarliest()
val streamSource = env
.addSource(myKConsumer)
// Transform CSV (with a header row per Kafka event) into a Transfers object
val streamTransfers = streamSource.map(new TransfersMapper())
// create a TableEnvironment
val tEnv = StreamTableEnvironment.create(env)
println("***** NEW EXECUTION STARTED AT " + LocalDateTime.now() + " *****")
// register a Table
val tblTransfers: Table = tEnv.fromDataStream(streamTransfers)
tEnv.createTemporaryView("transfers", tblTransfers)
tEnv.connect(
new Elasticsearch()
.version("7")
.host("elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local", 9200, "http") // required: one or more Elasticsearch hosts to connect to
.index("transfers-sum")
.documentType("_doc")
.keyNullLiteral("n/a")
)
.withFormat(new Json().jsonSchema("{type: 'object', properties: {curr_careUnit: {type: 'string'}, sum: {type: 'number'}}}"))
.withSchema(new Schema()
.field("curr_careUnit", DataTypes.STRING())
.field("sum", DataTypes.DOUBLE())
)
.inUpsertMode()
.createTemporaryTable("transfersSum")
val result = tEnv.sqlQuery(
"""
|SELECT curr_careUnit, sum(los)
|FROM transfers
|GROUP BY curr_careUnit
|""".stripMargin)
result.insertInto("transfersSum")
// Elasticsearch elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local:9200
env.execute("Flink Streaming Demo Dump to Elasticsearch")
}
}
I'm not sure how I can debug this beast... Wondering if somebody can help me figure out why the Flink job is not adding data to Elasticsearch :(
From my Flink cluster, I'm able to query Elasticsearch just fine (manually) and add records to my index:
curl -XPOST "http://elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local:9200/transfers-sum/_doc" -H 'Content-Type: application/json' -d'{"curr_careUnit":"TEST123","sum":"123"}'
A kind soul on the Flink mailing list pointed out that it could be Elasticsearch buffering my records... Well, it was. ;)
I added the following options to the Elasticsearch connector (chained onto the new Elasticsearch() descriptor shown above):
.bulkFlushMaxActions(2)
.bulkFlushInterval(1000L)
Flink Elasticsearch Connector 7 using Scala
Please find a working and detailed answer which I have provided here.

Error when using Solr to add data - Solr HTTP error: OK (409) (HttpException)

I have been trying this out for quite some time now, and I have even googled a lot.
I am getting this error while trying to add data to Solr using Solarium in Laravel:
(1/1) HttpException
Solr HTTP error: OK (409)
{
"responseHeader":{
"status":409,
"QTime":3},
"error":{
"metadata":[
"error-class","org.apache.solr.common.SolrException",
"root-error-class","org.apache.solr.common.SolrException"],
"msg":"version conflict for 12 expected=12435421423451 actual=-1",
"code":409}}
in Result.php line 106
at Result->__construct(object(Client), object(Query), object(Response)) in Client.php line 753
This is my function in EmployeeController.php
public function enterDataSolr()
{
$update = $this->client->createUpdate();
$doc1 = $update->createDocument();
$doc1->Gender = "M";
$doc1->Salary = 199999;
$doc1->SSN = "0050-03-10T21:00:00Z";
$doc1->City = "Mumbai";
$doc1->State = "Maharastra";
$doc1->Zip = 119973;
$doc1->Region = "Navi Mumbai";
$doc1->Password = "21435t34tgsd";
$doc1->id = 12;
$doc1->_Emp_ID = 1234546;
$doc1->Name_Prefix = "Mr.";
$doc1->First_Name = "Kant";
$doc1->Middle_Initial = "S";
$doc1->Last_Name = "Bhat";
$doc1->E_Mail = "nav@gmail.com";
$doc1->Father_s_Name = "Mant";
$doc1->Mother_s_Name = "Vandana";
$doc1->Mother_s_Maiden_Name = "vandana";
$doc1->Date_of_Birth = "12/2/1998";
$doc1->Time_of_Birth = "12:24";
$doc1->Age_in_Yrs = 21;
$doc1->Weight_in_Kgs = 56;
$doc1->Date_of_Joining = "2/2/2020";
$doc1->Quarter_of_Joining = "Q1";
$doc1->Half_of_Joining = "1st";
$doc1->Year_of_Joining = 2020;
$doc1->Month_of_Joining = 2;
$doc1->Month_Name_of_Joining = "February";
$doc1->Short_Month = "Feb";
$doc1->Day_of_Joining = 2;
$doc1->DOW_of_Joining = "Tuesday";
$doc1->Short_DOW = "Tues";
$doc1->Age_in_Company__Years_ = 2.4;
$doc1->Last___Hike = 2;
$doc1->Phone_No = 8906986022;
$doc1->Place_Name = "Delhi";
$doc1->User_Name = "kant";
$doc1->_version_ = 12435421423451;
$doc1->score = 1;
$doc2 = $update->createDocument();
$doc2->Gender = "F";
$doc2->Salary = '200000';
$doc2->SSN = "0050-03-10T00:00:00Z";
$doc2->City = "Purcellville";
$doc2->State = "VA";
$doc2->Zip = 20134;
$doc2->Region = "South";
$doc2->Password = "1";
$doc2->id = "2a69b460-2299-46a6-84b6-cf16938a1997";
$doc2->_Emp_ID = 520092;
$doc2->Name_Prefix = "Mrs.";
$doc2->First_Name = "Mary";
$doc2->Middle_Initial = "Watson";
$doc2->Last_Name = "Jane";
$doc2->E_Mail = "janemarie@hotmail.com";
$doc2->Father_s_Name = "Spder";
$doc2->Mother_s_Name = "May";
$doc2->Mother_s_Maiden_Name = "may";
$doc2->Date_of_Birth = "10/1/1921";
$doc2->Time_of_Birth = "12:02";
$doc2->Age_in_Yrs = 99;
$doc2->Weight_in_Kgs = 61;
$doc2->Date_of_Joining = "2/27/2020";
$doc2->Quarter_of_Joining = "Q2";
$doc2->Half_of_Joining = "Q1";
$doc2->Year_of_Joining = "Q4";
$doc2->Month_of_Joining = "2";
$doc2->Month_Name_of_Joining = "February";
$doc2->Short_Month = "Feb";
$doc2->Day_of_Joining = 27;
$doc2->DOW_of_Joining = "Tuesday";
$doc2->Short_DOW = "Tues";
$doc2->Age_in_Company__Years_ = 1.7;
$doc2->Last___Hike = "11%";
$doc2->Phone_No = 852489628962;
$doc2->Place_Name = "Purcellville";
$doc2->User_Name = "llwoods";
$doc2->_version_ = 1658322049611851997;
$doc2->score = 1;
$update->addDocuments(array($doc1, $doc2));
$update->addCommit();
$result = $this->client->update($update);
echo '<b>Update query executed</b><br/>';
echo 'Query status: ' . $result->getStatus(). '<br/>';
echo 'Query time: ' . $result->getQueryTime();
}
The connection is made properly, as the ping() function is returning status OK.
The search function is working properly as well.
This is the constructor
public function __construct(EmployeeRepository $emp_repository, Client $client)
{
$this->emp_repository = $emp_repository;
$this->client = $client;
//dd('Solarium library version: ' . Client::VERSION . ' - ');
}
and I have imported the class as well:
use Solarium\Client;
Optimistic Concurrency is a feature of Solr that can be used by client applications which update/replace documents to ensure that the document
they are replacing/updating has not been concurrently modified by another client application.
If there is a version conflict (HTTP error code 409), the client starts the process over.
This feature works by requiring a _version_ field on all documents in the index, and comparing that to a version specified as part of the update command.
By default, Solr’s Schema includes a _version_ field, and this field is automatically added to each new document.
$ curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/techproducts/update?_version_=1632740120218042368&versions=true&commit=true&omitHeader=true' --data-binary '
[{ "id" : "aaa",
"foo_s" : "update attempt with correct existing version" }]'
This is an update with a value for _version_ that matches the value in the index, so it succeeds. Because we included versions=true in the update request, the response includes the new value of the _version_ field.
If you send an update with a wrong value for _version_ embedded in the document itself, the request fails because you have specified the wrong version.
Below is the error for that case:
{
"error":{
"metadata":[
"error-class","org.apache.solr.common.SolrException",
"root-error-class","org.apache.solr.common.SolrException"],
"msg":"version conflict for aaa expected=100 actual=1632740462042284032",
"code":409
}
}
Please refer to the Solr documentation for more details.
The actual=-1 in your error means that Solr is not able to find an existing document to check that version against.
I would suggest you try sending one of the documents to Solr by hand via the Solr Admin UI.
Select your core/collection name, then click the Documents link (on the Solr admin page), and you'll be on the page where you can send the document to Solr for an update.
Solr Document Update
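As a quick way to test this outside Laravel, here is a minimal sketch using Python's requests against Solr's JSON update endpoint. The host and core name ("employees") are hypothetical, so adjust them to your setup; the point is that omitting _version_ from the document (or sending 0) skips the optimistic concurrency check instead of returning a 409.

import requests

# Hypothetical host and core name; change these to match your Solr setup.
solr_update_url = "http://localhost:8983/solr/employees/update?commit=true"

doc = {
    "id": 12,
    "First_Name": "Kant",
    "Salary": 199999,
    # No "_version_" field here: omitting it (or sending 0) disables the
    # version check, so the add/replace will not fail with a 409 conflict.
}

resp = requests.post(solr_update_url, json=[doc],
                     headers={"Content-Type": "application/json"})
print(resp.status_code, resp.json())

If this succeeds by hand, the fix in the Laravel code is the same: stop setting the _version_ (and score) pseudo-fields on the documents you add.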

IllegalStateException when trying to run Spark Streaming with Twitter

I am new to Spark and Scala. I am trying to run an example I found on Google, and I am encountering the following exception when running this program.
Exception is:
17/05/25 11:13:42 ERROR ReceiverTracker: Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error starting Twitter stream - java.lang.IllegalStateException: Authentication credentials are missing.
The code that I am executing is as follows:
PrintTweets.scala
package example
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.StreamingContext._
import org.apache.log4j.Level
import Utilities._
object PrintTweets {
def main(args: Array[String]) {
// Configure Twitter credentials using twitter.txt
setupTwitter()
val appName = "TwitterData"
val conf = new SparkConf()
conf.setAppName(appName).setMaster("local[3]")
val ssc = new StreamingContext(conf, Seconds(5))
//val ssc = new StreamingContext("local[*]", "PrintTweets", Seconds(10))
setupLogging()
// Create a DStream from Twitter using our streaming context
val tweets = TwitterUtils.createStream(ssc, None)
// Now extract the text of each status update into RDD's using map()
val statuses = tweets.map(status => status.getText())
statuses.print()
ssc.start()
ssc.awaitTermination()
}
}
Utilities.scala
package example
import org.apache.log4j.Level
import java.util.regex.Pattern
import java.util.regex.Matcher
object Utilities {
/** Makes sure only ERROR messages get logged to avoid log spam. */
def setupLogging() = {
import org.apache.log4j.{Level, Logger}
val rootLogger = Logger.getRootLogger()
rootLogger.setLevel(Level.ERROR)
}
/** Configures Twitter service credentials using twitter.txt in the main workspace directory */
def setupTwitter() = {
import scala.io.Source
for (line <- Source.fromFile("../twitter.txt").getLines) {
val fields = line.split(" ")
if (fields.length == 2) {
System.setProperty("twitter4j.oauth." + fields(0), fields(1))
}
}
}
/** Retrieves a regex Pattern for parsing Apache access logs. */
def apacheLogPattern():Pattern = {
val ddd = "\\d{1,3}"
val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?"
val client = "(\\S+)"
val user = "(\\S+)"
val dateTime = "(\\[.+?\\])"
val request = "\"(.*?)\""
val status = "(\\d{3})"
val bytes = "(\\S+)"
val referer = "\"(.*?)\""
val agent = "\"(.*?)\""
val regex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent"
Pattern.compile(regex)
}
}
When I check using print statements, I find the exception is happening at this line:
val tweets = TwitterUtils.createStream(ssc, None)
I am providing the credentials in the twitter.txt file, which is read properly by the program. When I don't place twitter.txt in the appropriate directory, it shows an explicit error, and it shows an explicit "unauthorized access" error when I leave the consumer key and secret blank in twitter.txt.
If you need more details about the error or the software versions, let me know.
Thanks,
Madhu.
I could reproduce the issue with your code. I believe this is your problem:
You might not have configured twitter.txt properly. Your twitter.txt file should look like this:
consumerKey your_consumerKey
consumerSecret your_consumerSecret
accessToken your_accessToken
accessTokenSecret your_accessTokenSecret
I hope it helps.
After changing the twitter.txt file syntax to the following, with a single space between the key and the value, it worked (setupTwitter() splits each line on a single space and expects exactly two fields, so any extra whitespace makes it skip the line):
consumerKey your_consumerKey
consumerSecret your_consumerSecret
accessToken your_accessToken
accessTokenSecret your_accessTokenSecret

Getting "Validation Failed: 1: no scroll ids specified" trying to read elastic search from spark

I am trying to read an Elasticsearch index into a Spark DataFrame through spark-shell (Spark version 1.5.2). I don't understand what a scroll id is or what I need to do to query Elasticsearch from Spark.
spark-shell --jars /transfer/hdp/lib/elasticsearch-spark_2.10-2.3.2.jar
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.elasticsearch.spark._
import org.elasticsearch.spark.sql._
import org.apache.spark.SparkConf
import sqlContext._
import sqlContext.implicits._
// Stop the current Spark context so we can override it
sc.stop()
// Create new spark config for Elastic Search
val config = new SparkConf()
config.set("es.nodes", "*elastic-search-host-name*")
config.set("es.resource", "spark_count/spark_count")
config.set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
// Start new spark context
val sc = new SparkContext(config)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Create dataframe for reading
val sparkDF = sqlContext.esDF("spark_count/spark_count")
// Print Schema Note this works
sparkDF.printSchema()
root
|-- color: string (nullable = true)
|-- event_time: timestamp (nullable = true)
|-- event_type: string (nullable = true)
|-- new_column: string (nullable = true)
|-- spark_count: string (nullable = true)
|-- train: string (nullable = true)
// Display 20 records
sparkDF.show()
[Stage 0:> (0 + 0) / 5]16/06/20 13:30:56 ERROR TaskContextImpl: Error in TaskCompletionListener
org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: ActionRequestValidationException[Validation Failed: 1: no scroll ids specified;]
at org.elasticsearch.hadoop.rest.RestClient.checkResponse(RestClient.java:478)
at org.elasticsearch.hadoop.rest.RestClient.executeNotFoundAllowed(RestClient.java:449)
at org.elasticsearch.hadoop.rest.RestClient.deleteScroll(RestClient.java:512)
at org.elasticsearch.hadoop.rest.ScrollQuery.close(ScrollQuery.java:70)
...
I just figured it out.
In development I'm running an old version of Elasticsearch (1.0), and while I can save data to Elasticsearch and display index schemas, the queries fail.
I tried it on Elasticsearch version 1.4 and it works.
