How do I prevent logging an encrypted airflow variable? - etl

I've set up an airflow which has an encrypted variable declared. I'm using BigQueryOperator. I use the encrypted variable in the SQL that is fed to the class. But the airflow logs the SQL after decrypting the variable. How can I prevent that from happening?

Unfortunately, there is no built-in way to achieve this.
A possible work-around is removing self.log.info('Executing: %s', self.sql) line in BigQueryOperator or creating a new Operator inheriting BigQueryOperator like below:
class CustomBQOperator(BigQueryOperator):
    """BigQueryOperator whose ``execute`` runs the query without logging it.

    The overridden ``execute`` deliberately contains no
    ``self.log.info('Executing: %s', self.sql)`` call, so SQL that embeds a
    decrypted Airflow variable does not end up in the task log.
    """

    # NOTE(review): the source showed ``#apply_defaults``; this is the
    # ``@apply_defaults`` decorator with its ``@`` lost in extraction. It must
    # be imported (``from airflow.utils.decorators import apply_defaults``).
    @apply_defaults
    def __init__(self, *args, **kwargs):
        # Two-argument form is required: ``super(CustomBQOperator)`` alone
        # builds an unbound super object, so the parent operator's
        # ``__init__`` would never run.
        super(CustomBQOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        """Run ``self.sql`` through the BigQuery cursor.

        :param context: task context supplied by Airflow (unused here).
        """
        # Create the hook/cursor lazily, only if one is not already attached.
        if self.bq_cursor is None:
            hook = BigQueryHook(
                bigquery_conn_id=self.bigquery_conn_id,
                use_legacy_sql=self.use_legacy_sql,
                delegate_to=self.delegate_to)
            conn = hook.get_conn()
            self.bq_cursor = conn.cursor()
        self.bq_cursor.run_query(
            self.sql,
            destination_dataset_table=self.destination_dataset_table,
            write_disposition=self.write_disposition,
            allow_large_results=self.allow_large_results,
            flatten_results=self.flatten_results,
            udf_config=self.udf_config,
            maximum_billing_tier=self.maximum_billing_tier,
            maximum_bytes_billed=self.maximum_bytes_billed,
            create_disposition=self.create_disposition,
            query_params=self.query_params,
            labels=self.labels,
            schema_update_options=self.schema_update_options,
            priority=self.priority,
            time_partitioning=self.time_partitioning,
            api_resource_configs=self.api_resource_configs,
            cluster_fields=self.cluster_fields,
        )
And then using this CustomBQOperator instead of BigQueryOperator

Related

DJANGO-STORAGES, PARAMIKO: connection failure for global connection

I have a strange problem using the SFTP-API from django-storages(https://github.com/jschneier/django-storages). I am trying to use it in order to fetch media-files, which are stored on a different server and thus needed to create a Proxy for SFTP Downloads, since plain Django just sends GET-requests to the MEDIA_ROOT. I figured that Middleware provides a good hook:
import mimetypes
from storages.backends.sftpstorage import SFTPStorage
from django.http import HttpResponse
from storages.backends.sftpstorage import SFTPStorage
class SFTPMiddleware:
    """Django middleware that answers a request from SFTP storage when the
    requested path exists there; otherwise the normal response is returned.

    This variant creates a new ``SFTPStorage`` on every request.
    """

    def __init__(self, get_response):
        """Store the next handler in the middleware chain."""
        self.get_response = get_response

    def __call__(self, request):
        """Return the SFTP file as an attachment if present, else the view's response."""
        # Code to be executed for each request before
        # the view (and later middleware) are called.
        response = self.get_response(request)
        try:
            remote_path = request.get_full_path()
            storage = SFTPStorage()  # <- this is where the magic happens
            if not storage.exists(remote_path):
                return response
            payload = storage._read(remote_path)
            content_type, _encoding = mimetypes.guess_type(remote_path)
            response = HttpResponse(payload, content_type=content_type)
            disposition = u'attachment; filename="{filename}"'.format(filename=remote_path)
            response['Content-Disposition'] = disposition
        except PermissionError:
            # Best effort: fall back to whatever response we already have.
            pass
        return response
which works fine, but obviously it opens a new connection every time a website call is issued, which I don't want (it also crashes after 3 reloads or so; I think it has too many parallel connections by then). So I tried opening just one connection to the server via SFTP by moving the SFTP = SFTPStorage() initialization into the __init__() method, which is called only once:
import mimetypes
from storages.backends.sftpstorage import SFTPStorage
from django.http import HttpResponse
from storages.backends.sftpstorage import SFTPStorage
class SFTPMiddleware:
    """Django middleware that serves a file from SFTP storage when the
    requested path exists there, reusing one ``SFTPStorage`` instance.

    NOTE(review): the surrounding question reports that this shared-instance
    variant hangs around ``exists()`` / ``_read()``; the cause is not visible
    in this snippet.
    """

    def __init__(self, get_response):
        """Store the next handler and create the single shared storage object."""
        self.get_response = get_response
        self.SFTP = SFTPStorage()  # <- this is where the magic happens

    def __call__(self, request):
        """Return the SFTP file as an attachment if present, else the view's response."""
        # Code to be executed for each request before
        # the view (and later middleware) are called.
        response = self.get_response(request)
        storage = self.SFTP
        try:
            target = request.get_full_path()
            if storage.exists(target):
                body = storage._read(target)
                guessed = mimetypes.guess_type(target)
                response = HttpResponse(body, content_type=guessed[0])
                response['Content-Disposition'] = u'attachment; filename="{filename}"'.format(filename=target)
        except PermissionError:
            # Best effort: keep the response produced by the view.
            pass
        return response
But this implementation doesn't seem to work, the program is stuck either before the SFTP.exists() or after the SFTP._read() methods.
Can anybody tell me how to fix this problem? Or does anybody even have a better idea as to how to tackle this problem?
Thanks in advance,
Kingrimursel

Getting java.lang.RuntimeException:driver class not found when used jdbc_hook more than once in airflow operator

The use case is to run a list of SQL statements in Hive and then update the Impala metadata. As shown below, the two methods for Hive and Impala both use jdbc_hook. In whichever order I call these methods, only the first one runs and the second one throws ERROR - java.lang.RuntimeException: Class <driver name of hive/impala> not found. Each method runs fine when used separately.
Please find the execute method of airflow custom operator :::
Note :: I can't use hive_operator to run hive statements. And I don't see any methods in HiveServer2_Hook. Am new to airflow any help is much appreciated
from airflow.models.baseoperator import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.hooks.jdbc_hook import JdbcHook
import sqlparse
class CustomHiveOperator(BaseOperator):
    """
    Executes hql code and then invalidates metadata / computes stats in impala
    for that table.
    Requires JdbcHook, sqlparse.

    :param hive_jdbc_conn: reference to a predefined hive database
    :type hive_jdbc_conn: str
    :param impala_jdbc_conn: reference to a predefined impala database
    :type impala_jdbc_conn: str
    :param table_name: hive table name, used for post process in impala
    :type table_name: str
    :param script_path: hql script path to run in hive
    :type script_path: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: mapping or iterable
    """

    # The source showed ``#apply_defaults``: the ``@`` of the decorator
    # imported above the class was lost in extraction.
    @apply_defaults
    def __init__(
            self,
            hive_jdbc_conn: str,
            impala_jdbc_conn: str,
            table_name: str,
            script_path: str,
            autocommit=False,
            parameters=None,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.hive_jdbc_conn = hive_jdbc_conn
        self.impala_jdbc_conn = impala_jdbc_conn
        self.table_name = table_name
        self.script_path = script_path
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        """Run the hive script, then the impala post-processing statements."""
        # NOTE(review): per the answer accompanying this snippet, the second
        # JdbcHook connection can fail with "driver class not found" because
        # the JVM is started with only the first connection's driver.
        self.hive_run()
        self.impala_post_process()

    def format_string(self, x):
        """Return ``x`` with every ``;`` removed."""
        return x.replace(";", "")

    def hive_run(self):
        """Read ``script_path``, split it into statements and run them on hive."""
        with open(self.script_path) as f:
            data = f.read()
        hql_temp = sqlparse.split(data)
        hql = [self.format_string(x) for x in hql_temp]
        self.log.info('Executing: %s', hql)
        self.hive_hook = JdbcHook(jdbc_conn_id=self.hive_jdbc_conn)
        self.hive_hook.run(hql, self.autocommit, parameters=self.parameters)

    def impala_post_process(self):
        """Run INVALIDATE METADATA and COMPUTE STATS for ``table_name`` on impala."""
        invalidate = 'INVALIDATE METADATA ' + self.table_name
        compute_stats = 'COMPUTE STATS ' + self.table_name
        hql = [invalidate, compute_stats]
        self.log.info('Executing: %s', hql)
        self.impala_hook = JdbcHook(jdbc_conn_id=self.impala_jdbc_conn)
        self.impala_hook.run(hql, self.autocommit, parameters=self.parameters)
This is actually an issue with how Airflow uses jaydebeapi and the underlying JPype modules to facilitate the JDBC connection.
A Java virtual machine is started when JPype is first used (the first JdbcHook.get_conn call) and the only libraries that the virtual machine is made aware of is the specific one you're using for whichever JDBC connection is being made. When you create another connection the virtual machine is already started and isn't aware of the libraries necessary for a different connection type.
The only way that I have found around this is to use an extension of JdbcHook which overrides the get_conn method to gather the paths of all JDBC drivers that are defined as a Connection object in Airflow. See here for the Airflow implementation.

Trying to access an object from a listener python web framework

Pretty new to asynch so here is my question and thank you in advance.
Hi All very simple question I might be thinking too much into.
I am trying to access this cassandra client outside of these defined listeners below that get registered to a sanic main app.
I need the session in order to run an update query that will execute asynchronously. I can definitely connect and even query from the 'setup_cassandra_session_listener' method below, but I am having a tough time figuring out how to expose this Cassandra session outside the listener so I can access it elsewhere.
from aiocassandra import aiosession
from cassandra.cluster import Cluster
from sanic import Sanic
from config import CLUSTER_HOST, TABLE_NAME, CASSANDRA_KEY_SPACE, CASSANDRA_PORT, DATA_CENTER, DEBUG_LEVEL, LOGGER_FORMAT
import logging  # used below but missing from this snippet's imports

# Reuse the logger named 'sanic' and make sure INFO messages are emitted.
log = logging.getLogger('sanic')
log.setLevel('INFO')
# Module-level handle to the cluster; assigned by the setup listener and
# read by the teardown listener.
cassandra_cluster = None
def setup_cassandra_session_listener(app, loop):
    """Connect to Cassandra and attach the session to ``app``.

    Stores the cluster in the module-level ``cassandra_cluster`` and exposes
    one session as both ``app.session`` and ``app.cassandra``.

    :param app: the application object the session is attached to.
    :param loop: event loop passed by the listener machinery (unused here).
    """
    global cassandra_cluster
    # NOTE(review): DATA_CENTER is passed as the third positional argument of
    # Cluster; confirm that this is the parameter it is meant for.
    cassandra_cluster = Cluster([CLUSTER_HOST], CASSANDRA_PORT, DATA_CENTER)
    # Connect once; previously a second, separate session was opened just for
    # ``app.session``.
    session = cassandra_cluster.connect(CASSANDRA_KEY_SPACE)
    metadata = cassandra_cluster.metadata
    log.info('Connected to cluster: ' + metadata.cluster_name)
    aiosession(session)
    app.session = session
    app.cassandra = session
def teardown_cassandra_session_listener(app, loop):
    """Shut down the module-level Cassandra cluster, if one was created.

    :param app: the application object (unused here).
    :param loop: event loop passed by the listener machinery (unused here).
    """
    global cassandra_cluster
    # The global is still None when setup never ran or failed before
    # assigning it; calling shutdown() on None would raise.
    if cassandra_cluster is not None:
        cassandra_cluster.shutdown()
def register_cassandra(app: Sanic):
    """Attach the Cassandra setup and teardown listeners to ``app``."""
    hooks = (
        ('before_server_start', setup_cassandra_session_listener),
        ('after_server_stop', teardown_cassandra_session_listener),
    )
    for event, handler in hooks:
        app.listener(event)(handler)
Here is a working example that should do what you need. It does not actually run Cassandra (since I have no experience doing that). But, in principle this should work with any database connection you need to manage across the lifespan of your running server.
from sanic import Sanic
from sanic.response import text
# Newer Sanic releases refuse to create an unnamed application, so give the
# app an explicit name; older releases accept the argument as well.
app = Sanic("cassandra_demo")
class DummyCluser:
    """Stand-in for a real cluster object; it only prints what it would do."""

    def connect(self):
        """Print a connect message and return the placeholder session value."""
        print("Connecting")
        placeholder_session = "session"
        return placeholder_session

    def shutdown(self):
        """Print a shutdown message."""
        print("Shutting down")
def setup_cassandra_session_listener(app, loop):
    """Create the cluster and its session and keep both on ``app``."""
    # No global variables needed
    cluster = DummyCluser()
    app.cluster = cluster
    app.session = cluster.connect()
def teardown_cassandra_session_listener(app, loop):
    """Shut down the cluster that the setup listener stored on ``app``."""
    cluster = app.cluster
    cluster.shutdown()
def register_cassandra(app: Sanic):
    """Register the setup and teardown listeners on ``app``."""
    # Changed these listeners to be more friendly if running with an ASGI server
    on_start = app.listener('after_server_start')
    on_start(setup_cassandra_session_listener)
    on_stop = app.listener('before_server_stop')
    on_stop(teardown_cassandra_session_listener)
# The source showed ``#app.get("/")``: the ``@`` of the route decorator was
# lost in extraction, which left the handler unregistered.
@app.get("/")
async def get(request):
    """Return the stored session value as a plain-text response."""
    # Reach the app through the request rather than the module global, so the
    # handler works wherever it is defined.
    return text(request.app.session)
# Script entry point: register the listeners on the module-level app, then
# start the server in debug mode.
if __name__ == "__main__":
    register_cassandra(app)
    app.run(debug=True)
The idea is that you attach to your app instance (as you did) and then are able to simply access that inside your routes with request.app.

How to reuse aiohttp ClientSession pool?

The docs say to reuse the ClientSession:
Don’t create a session per request. Most likely you need a session per
application which performs all requests altogether.
A session contains a connection pool inside, connection reusage and
keep-alives (both are on by default) may speed up total performance.1
But there doesn't seem to be any explanation in the docs about how to do this? There is one example that's maybe relevant, but it does not show how to reuse the pool elsewhere: http://aiohttp.readthedocs.io/en/stable/client.html#keep-alive-connection-pooling-and-cookie-sharing
Would something like this be the correct way to do it?
# Decorator restored: the source showed ``#app.listener(...)`` with the ``@``
# lost in extraction, which left the listener unregistered.
@app.listener('before_server_start')
async def before_server_start(app, loop):
    """Create the asyncpg pool and the shared aiohttp session on ``app``."""
    app.pg_pool = await asyncpg.create_pool(**DB_CONFIG, loop=loop, max_size=100)
    app.http_session_pool = aiohttp.ClientSession()
# Decorator restored: the source showed ``#app.listener(...)`` with the ``@``
# lost in extraction, which left the listener unregistered.
@app.listener('after_server_stop')
async def after_server_stop(app, loop):
    """Close the shared aiohttp session and the asyncpg pool."""
    # NOTE(review): the two close() calls are kept exactly as the question
    # wrote them; the answer that follows this snippet explains that the
    # session's close() is a coroutine and should be awaited.
    app.http_session_pool.close()
    app.pg_pool.close()
# Decorator restored: the source showed ``#app.post(...)`` with the ``@`` lost
# in extraction, which left the route unregistered.
@app.post("/api/register")
async def register(request):
    """Create the user record, post the activation request, reply with 204."""
    # json validation
    async with app.pg_pool.acquire() as pg:
        await pg.execute()  # create unactivated user in db
    # Use the shared session directly. Entering it with ``async with`` would
    # close it when the block exits, so only the first request could use it.
    session = app.http_session_pool
    # TODO send activation email using SES API
    async with session.post('http://httpbin.org/post', data=b'data') as resp:
        print(resp.status)
        print(await resp.text())
    return HTTPResponse(status=204)
There're few things I think can be improved:
1)
An instance of ClientSession is one session object. This one session contains a pool of connections, but it is not a "session pool" itself. I would suggest renaming http_session_pool to http_session or maybe client_session.
2)
Session's close() method is a coroutine. You should await it:
await app.client_session.close()
Or even better (IMHO), instead of thinking about how to properly open/close session use standard async context manager with awaiting of __aenter__ / __aexit__:
# Decorator restored: the ``@`` was lost in extraction.
@app.listener('before_server_start')
async def before_server_start(app, loop):
    """Open the shared client session by awaiting its ``__aenter__``."""
    # ...
    app.client_session = await aiohttp.ClientSession().__aenter__()
# Decorator restored: the ``@`` was lost in extraction.
@app.listener('after_server_stop')
async def after_server_stop(app, loop):
    """Release the shared client session by awaiting its ``__aexit__``."""
    # No exception is in flight, hence the three ``None`` arguments.
    await app.client_session.__aexit__(None, None, None)
    # ...
3)
Pay attention to this info:
However, if the event loop is stopped before the underlying connection
is closed, an ResourceWarning: unclosed transport warning is emitted
(when warnings are enabled).
To avoid this situation, a small delay must be added before closing
the event loop to allow any open underlying connections to close.
I'm not sure it's mandatory in your case, but there's nothing bad in adding await asyncio.sleep(0) inside after_server_stop, as the documentation advises:
# Decorator restored: the ``@`` was lost in extraction.
@app.listener('after_server_stop')
async def after_server_stop(app, loop):
    """Yield to the event loop once so that closing connections can finish."""
    # ...
    await asyncio.sleep(0)  # http://aiohttp.readthedocs.io/en/stable/client.html#graceful-shutdown
Upd:
Class that implements __aenter__ / __aexit__ can be used as async context manager (can be used in async with statement). It allows to do some actions before executing internal block and after it. This is very similar to regular context managers, but asyncio related. Same as regular context manager async one can be used directly (without async with) manually awaiting __aenter__ / __aexit__.
Why do I think it's better to create/free session using __aenter__ / __aexit__ manually instead of using close(), for example? Because we shouldn't worry what actually happens inside __aenter__ / __aexit__. Imagine in future versions of aiohttp creating of session will be changed with the need to await open() for example. If you'll use __aenter__ / __aexit__ you wouldn't need to somehow change your code.
seems no session pool in aiohttp.
// just post some official docs.
persistent session
here is persistent-session usage demo in official site
https://docs.aiohttp.org/en/latest/client_advanced.html#persistent-session
async def persistent_session(app):
    """Cleanup context: create one ClientSession for the app and close it at shutdown.

    The code before ``yield`` stores the session under
    ``app['PERSISTENT_SESSION']``; the code after it closes the session.
    """
    app['PERSISTENT_SESSION'] = session = aiohttp.ClientSession()
    yield
    await session.close()


# Registered after the definition so the name exists when this line runs.
app.cleanup_ctx.append(persistent_session)
async def my_request_handler(request):
    """Fetch python.org through the app-wide session and print the status."""
    app_state = request.app
    http = app_state['PERSISTENT_SESSION']
    async with http.get("http://python.org") as reply:
        status = reply.status
        print(status)
//TODO: a full runnable demo code
connection pool
and it has a connection pool:
https://docs.aiohttp.org/en/latest/client_advanced.html#connectors
# Build the connector explicitly so that it can be handed to the session
# below; the commented-out lines are alternative connector configurations.
conn = aiohttp.TCPConnector()
#conn = aiohttp.TCPConnector(limit=30)
#conn = aiohttp.TCPConnector(limit=0) # nolimit, default is 100.
#conn = aiohttp.TCPConnector(limit_per_host=30) # default is 0
# NOTE(review): shown here as top-level statements; a later answer on this
# page reports a UserWarning when a session is created outside a coroutine.
session = aiohttp.ClientSession(connector=conn)
I found this question after searching on Google on how to reuse an aiohttp ClientSession instance after my code was triggering this warning message: UserWarning: Creating a client session outside of coroutine is a very dangerous idea
This code may not solve the above problem though it is related. I am new to asyncio and aiohttp, so this may not be best practice. It's the best I could come up with after reading a lot of seemingly conflicting information.
I created a class ResourceManager taken from the Python docs that opens a context.
The ResourceManager instance handles the opening and closing of the aiohttp ClientSession instance via the magic methods __aenter__ and __aexit__ with BaseScraper.set_session and BaseScraper.close_session wrapper methods.
I was able to reuse a ClientSession instance with the following code.
The BaseScraper class also has methods for authentication. It depends on the lxml third-party package.
import asyncio
from time import time
from contextlib import contextmanager, AbstractContextManager, ExitStack
import aiohttp
import lxml.html
class ResourceManager(AbstractContextManager):
    """Context manager that acquires a resource from ``scraper`` on entry,
    validates it, and releases it on exit.

    :param scraper: object providing ``acquire_resource`` and
        ``release_resource`` callables (both called without arguments).
    :param check_resource_ok: optional callable taking the acquired resource
        and returning a truthy value when it is usable; defaults to a check
        that always passes.
    """
    # Code taken from Python docs: 29.6.2.4. of https://docs.python.org/3.6/library/contextlib.html

    def __init__(self, scraper, check_resource_ok=None):
        self.acquire_resource = scraper.acquire_resource
        self.release_resource = scraper.release_resource
        if check_resource_ok is None:
            def check_resource_ok(resource):
                return True
        self.check_resource_ok = check_resource_ok

    # The source showed ``#contextmanager``: the ``@`` was lost in extraction.
    # Without the decorator this is a plain generator function and
    # ``with self._cleanup_on_error():`` fails.
    @contextmanager
    def _cleanup_on_error(self):
        """Release the resource if the wrapped block raises; keep it otherwise."""
        with ExitStack() as stack:
            stack.push(self)
            yield
            # The validation check passed and didn't raise an exception
            # Accordingly, we want to keep the resource, and pass it
            # back to our caller
            stack.pop_all()

    def __enter__(self):
        """Acquire and validate the resource.

        :returns: whatever ``acquire_resource()`` returned.
        :raises RuntimeError: if ``check_resource_ok`` rejects the resource
            (the resource is released before the error propagates).
        """
        resource = self.acquire_resource()
        with self._cleanup_on_error():
            if not self.check_resource_ok(resource):
                msg = "Failed validation for {!r}"
                raise RuntimeError(msg.format(resource))
        return resource

    def __exit__(self, *exc_details):
        """Release the resource; exceptions from the body are not suppressed."""
        # We don't need to duplicate any of our resource release logic
        self.release_resource()
class BaseScraper:
    """Fetch a collection of URLs concurrently through one aiohttp session.

    ``set_session`` / ``close_session`` open and close the session; they are
    also exposed as ``acquire_resource`` / ``release_resource``. Calling the
    instance fetches every URL in ``self.urls`` and returns a list of
    ``(url, body_bytes)`` pairs.
    """

    login_url = ""
    login_data = dict()  # dict of key, value pairs to fill the login form
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Newer Python versions no longer create a loop implicitly.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    def __init__(self, urls):
        self.urls = urls
        self.acquire_resource = self.set_session
        self.release_resource = self.close_session

    async def _set_session(self):
        self.session = await aiohttp.ClientSession().__aenter__()

    def set_session(self):
        """Open the session on ``self.loop`` and return this scraper."""
        set_session_attr = self.loop.create_task(self._set_session())
        self.loop.run_until_complete(set_session_attr)
        return self  # variable after "as" becomes instance of BaseScraper

    async def _close_session(self):
        await self.session.__aexit__(None, None, None)

    def close_session(self):
        """Close the session on ``self.loop``."""
        close_session = self.loop.create_task(self._close_session())
        self.loop.run_until_complete(close_session)

    def __call__(self):
        """Fetch all URLs and return the list of ``(url, body)`` results."""
        fetch_urls = self.loop.create_task(self._fetch())
        return self.loop.run_until_complete(fetch_urls)

    async def _get(self, url):
        async with self.session.get(url) as response:
            result = await response.read()
            return url, result

    async def _fetch(self):
        start = time()
        tasks = [self.loop.create_task(self._get(url)) for url in self.urls]
        results = await asyncio.gather(*tasks)
        print(
            "time elapsed: {} seconds \nurls count: {}".format(
                # Count this instance's URLs; the original read a module
                # global named ``urls`` that only exists in the demo script.
                time() - start, len(self.urls)
            )
        )
        return results

    # The source showed ``#property``; ``login`` reads ``self.form`` without
    # calling it, so the decorator is required.
    @property
    def form(self):
        """Create and return form for authentication."""
        form = aiohttp.FormData(self.login_data)
        get_login_page = self.loop.create_task(self._get(self.login_url))
        url, login_page = self.loop.run_until_complete(get_login_page)
        login_html = lxml.html.fromstring(login_page)
        # ``@type`` selects the attribute; the ``@`` was lost in extraction.
        hidden_inputs = login_html.xpath(r'//form//input[@type="hidden"]')
        # Skip inputs without a name and treat a missing value as empty
        # instead of raising KeyError.
        login_form = {
            x.attrib["name"]: x.attrib.get("value", "")
            for x in hidden_inputs
            if "name" in x.attrib
        }
        for key, value in login_form.items():
            form.add_field(key, value)
        return form

    async def _login(self, form):
        async with self.session.post(self.login_url, data=form) as response:
            if response.status != 200:
                response.raise_for_status()
            # ``url`` is not defined in this scope; report the login URL.
            print("logged into {}".format(self.login_url))
            await response.release()

    def login(self):
        """Post the authentication form to ``login_url``."""
        post_login_form = self.loop.create_task(self._login(self.form))
        self.loop.run_until_complete(post_login_form)
# Demo entry point: fetch the same URL ten times through one session that the
# ResourceManager opens on entry and closes on exit.
if __name__ == "__main__":
    # Kept as a module-level name: the original ``BaseScraper._fetch`` reads a
    # global called ``urls`` when it prints the URL count.
    urls = ("http://example.com",) * 10
    base_scraper = BaseScraper(urls)
    with ResourceManager(base_scraper) as scraper:
        # Calling the scraper yields (url, body) pairs.
        for url, html in scraper():
            print(url, len(html))

How to have state in a an ansible callback plugin

I want to make a ansible callback plugin, that hides sensitive data in the ansible output. There is a suggestion on how to do it here:
from ansible.plugins.callback.default import CallbackModule as CallbackModule_default
import os, collections
class CallbackModule(CallbackModule_default):
    """Stdout callback that masks password-like values before results are dumped."""

    CALLBACK_VERSION = 2.0
    CALLBACK_TYPE = 'stdout'
    CALLBACK_NAME = 'protect_data'

    def __init__(self, display=None):
        super(CallbackModule, self).__init__(display)

    def hide_password(self, result):
        """Return a copy of ``result`` with sensitive values replaced.

        Nested mappings are processed recursively; any other value whose key
        contains ``"password"`` is replaced by ``"********"``.

        :param result: mapping of result keys to values.
        :returns: a new dict; ``result`` itself is not modified.
        """
        # ``collections.Mapping`` was removed in Python 3.10; fall back to the
        # old location only where ``collections.abc`` does not exist.
        try:
            from collections.abc import Mapping
        except ImportError:  # Python 2
            from collections import Mapping
        ret = {}
        # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
        for key, value in result.items():
            if isinstance(value, Mapping):
                ret[key] = self.hide_password(value)
            elif "password" in key:
                ret[key] = "********"
            else:
                ret[key] = value
        return ret

    def _dump_results(self, result, indent=None, sort_keys=True, keep_invocation=False):
        """Mask the result, then delegate to the default implementation."""
        return super(CallbackModule, self)._dump_results(self.hide_password(result), indent, sort_keys, keep_invocation)
Now this example hides "password". I now want to make the word, that are hidden configurable at runtime of the playbook.
Can I somehow give the plugin a state (a list of words to hide) and modify it at the runtime of the playbook?
You can set self.words_list inside __init__ to some default value.
Then inside ...on_task_start and ...on_handler_task_start check for some specific variable and modify your self.words_list accordingly.
You can take a look at how persistent properties to collect statistics are used in profile_tasks callback plugin.

Resources