How to output json using HTSQL's python library - htsql

From the example:
>>> from htsql import HTSQL
>>> htsql = HTSQL("pgsql:///htsql_demo")
>>> rows = htsql.produce("/school{name, count(department)}")
How do I convert rows into JSON? Using the JSON formatter blows up:
>>> rows = htsql.produce("/school{name, count(department)}/:json")
UnsupportedActionError: unsupported action
While processing:
/school{name, count(department)}/:json
^^^^
I'm using HTSQL 2.3.3

It has to be done via internal API:
from htsql import HTSQL
demo = HTSQL('pgsql:///htsql_demo')
rows = demo.produce('/school{name, count(department)}')
from htsql.core.fmt.emit import emit
with demo:
text = ''.join(emit('x-htsql/json', rows))
print text
The credit goes to Kirill Simonov, from the HTSQL user's group.

Related

How to change xcom in Airflow to accomodate large data?

I am using the following code in my Airflow operator:
import json
import pandas as pd
from airflow.exceptions import AirflowException
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
class HttpToGoogleCloudStorageOperator(BaseOperator):
template_fields = ['endpoint', 'data', 'headers', ]
template_ext = ()
ui_color = '#f4a460'
#apply_defaults
def __init__(self,
endpoint,
project_id,
table_id,
data=None,
headers=None,
auth=None,
http_conn_id='http_default',
*args, **kwargs):
super(HttpToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
self.table_id = table_id
self.http_conn_id = http_conn_id
self.method = "GET"
self.endpoint = endpoint
self.headers = headers or {}
self.auth = auth
self.data = data or {}
def execute(self, context):
http = HttpHook(self.method, http_conn_id=self.http_conn_id)
self.log.info("Calling HTTP method " + self.endpoint)
response = http.run(self.endpoint, self.data, self.headers,auth=self.auth)
self.log.info("Got response")
Unfortunately the data returned is too large (about 5k) to fit in the standard xcom and I get this error:
{taskinstance.py:1059} ERROR - (_mysql_exceptions.DataError) (1406, "Data too long for column 'value' at row 1")
Is there a way I can tell http_hook to use a different xcom, or (even better) not use xcom at all? I have looked around and I do not see a solution.
Thanks for any tips or pointers.
Edit: Here is how I call the operator. Note that nowhere do I specify xcom.
query_load_task = HttpToGoogleCloudStorageOperator(
task_id="query_load_task",
endpoint=endpoint,
project_id="my_gcp_poroject_id",
table_id="dataset.table",
data=None,
auth=(username, password))
It's preferable to store data to a system designed for such (e.g.: the file system, AWS S3, Azure, etc.) and instead return a unique identifier to reference the location of the data, for the file system this would likely be the full path (e.g.: /tmp/acme_response_20200709.csv) that way you leverage the best of both the storage system and your database.
If you add your code I'd be happy to take a crack at writing up some psuedo-code as an example.

Gremlin Python project By clause

Have a graph running on Datastax Enterprise Graph (5.1 Version), running on Cassandra storage.
Trying to run a query to get both the ID and the property.
In Gremlin Console I can do this:
gremlin> g.V(1).project("v", "properties").by().by(valueMap())
==>[v:v[1],properties:[name:[marko],age:[29]]]
How can I translate the valueMap call still using Python GraphTraversal API. I know I can run a direct query via Session Execution, like this.
session.execute_graph("g.V().has(\"Node_Name\",\"A\").project(\"v\", \"properties\").by().by(valueMap())",{"name":graph_name})
Below is my setup code.
from dse.cluster import Cluster, EXEC_PROFILE_GRAPH_DEFAULT
from dse_graph import DseGraph
from dse.cluster import GraphExecutionProfile, EXEC_PROFILE_GRAPH_SYSTEM_DEFAULT
from dse.graph import GraphOptions
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Cardinality
from gremlin_python.process.traversal import Column
from gremlin_python.process.traversal import Direction
from gremlin_python.process.traversal import Operator
from gremlin_python.process.traversal import P
from gremlin_python.process.traversal import Pop
from gremlin_python.process.traversal import Scope
from gremlin_python.process.traversal import Barrier
graph_name = "TEST"
graph_ip = ["127.0.0.1"]
graph_port = 9042
schema = """
schema.edgeLabel("Group").create();
schema.propertyKey("Version").Text().create();
schema.edgeLabel("Group").properties("Version").add()
schema.vertexLabel("Example").create();
schema.edgeLabel("Group").connection("Example", "Example").add()
schema.propertyKey("Node_Name").Text().create();
schema.vertexLabel("Example").properties("Node_Name").add()
schema.vertexLabel("Example").index("exampleByName").secondary().by("Node_Name").add();
"""
profile = GraphExecutionProfile(
graph_options=GraphOptions(graph_name=graph_name))
client = Cluster(
contact_points=graph_ip, port=graph_port,
execution_profiles={EXEC_PROFILE_GRAPH_DEFAULT: profile}
)
graph_name = graph_name
session = client.connect()
graph = DseGraph.traversal_source(session)
# force the schema to be clean
session.execute_graph(
"system.graph(name).ifExists().drop();",
{'name': graph_name},
execution_profile=EXEC_PROFILE_GRAPH_SYSTEM_DEFAULT
)
session.execute_graph(
"system.graph(name).ifNotExists().create();",
{'name': graph_name},
execution_profile=EXEC_PROFILE_GRAPH_SYSTEM_DEFAULT
)
session.execute_graph(schema)
session.shutdown()
session = client.connect()
graph = DseGraph.traversal_source(session)
Update:
I guess i have not made the problem clear. It is in python and not in gremlin console. So running code like graph.V().has("Node_Name","A").project("v","properties").by().by(valueMap()).toList()
will give following result. How to execute the gremlin query while still remain in the GLV level, not drop down to text serialized query to Gremlin-Server?
Traceback (most recent call last):
File "graph_test.py", line 79, in <module>
graph.V().has("Node_Name","A").project("v", "properties").by().by(valueMap()).toList()
NameError: name 'valueMap' is not defined
I may not fully understand your question but it seems like you largely have the answer most of the way there. This last line of code:
graph = DseGraph.traversal_source(session)
should probably be written as:
g = DseGraph.traversal_source(session)
The return value of traversal_source(session) is a TraversalSource and not a Graph instance and by convention TinkerPop tends to refer to such a variable as g. Once you have a TraversalSource, then you can just write your Gremlin.
g = DseGraph.traversal_source(session)
g.V().has("Node_Name","A").project("v", "properties").by().by(valueMap()).toList()

Cannot find the desired link for download (Python BeautifulSoup)

I am pretty new to Python Beautiful Soup and I don't have much knowledge about html or js. I tried to use bs4 to download all xls files in this page, but it seems that bs4 cannot find the links under "attachment" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
"""
Get all links
"""
def getAllLinks(url):
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),"html.parser")
links = soup.find_all('a', href=True)
return links
def download(links):
for link in links:
#raw_input("Press Enter to continue...")
#print link
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))[1]
suffix = os.path.splitext(os.path.basename(link['href']))[1]
if os.path.splitext(os.path.basename(link['href']))[1] == '.xls':
print link #cannot find anything
currentLink = urllib2.urlopen(link)
links =
getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!
This seems to be one of the tasks for which BeautifulSoup (in itself, at least) is inadequate. You can, however, do it with selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[#class="file"]/a')
>>> len(links)
30
>>> for link in links:
... link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'

How can I get a one line per test result with Robot Framework?

I want to take test case results from Robot Framework runs and import those results into other tools (ElasticSearch, ALM tools, etc).
Towards that end I would like to be able to generate a text file with one line per test. Here is an example line pipe delimited:
testcase name | time run | duration | status
There are other fields I would add but those are the basic ones. Any help appreciated. I have been looking at robot.result http://robot-framework.readthedocs.io/en/3.0.2/autodoc/robot.result.html but haven't figured it out yet. If/when I do I will post answer here.
Thanks,
The output.xml file is very easy to parse with normal XML parsing libraries.
Here's a quick example:
from __future__ import print_function
import xml.etree.ElementTree as ET
from datetime import datetime
def get_robot_results(filepath):
results = []
with open(filepath, "r") as f:
xml = ET.parse(f)
root = xml.getroot()
if root.tag != "robot":
raise Exception("expect root tag 'robot', got '%s'" % root.tag)
for suite_node in root.findall("suite"):
for test_node in suite_node.findall("test"):
status_node = test_node.find("status")
name = test_node.attrib["name"]
status = status_node.attrib["status"]
start = status_node.attrib["starttime"]
end = status_node.attrib["endtime"]
start_time = datetime.strptime(start, '%Y%m%d %H:%M:%S.%f')
end_time = datetime.strptime(end, '%Y%m%d %H:%M:%S.%f')
elapsed = str(end_time-start_time)
results.append([name, start, elapsed, status])
return results
if __name__ == "__main__":
results = get_robot_results("output.xml")
for row in results:
print(" | ".join(row))
Bryan is right that it's easy to parse Robot's output.xml using standard XML parsing modules. Alternatively you can use Robot's own result parsing modules and the model you get from it:
from robot.api import ExecutionResult, SuiteVisitor
class PrintTestInfo(SuiteVisitor):
def visit_test(self, test):
print('{} | {} | {} | {}'.format(test.name, test.starttime,
test.elapsedtime, test.status))
result = ExecutionResult('output.xml')
result.suite.visit(PrintTestInfo())
For more details about the APIs used above see http://robot-framework.readthedocs.io/.

Python Requests logging to file

How can I configure logging to file requests's get or post?
my_config = {'verbose': sys.stderr}
requests.get('http://httpbin.org/headers', config=my_config)
What should I use in verbose?
Have you tried simply opening a file?
>>> import sys
>>> type(sys.stderr)
file
>>> f = open('test.log', 'w')
>>> type(f)
f
So the example above will look like this:
my_config = { 'verbose': open('/path/to/file', 'w') }
requests.get('http://httpbin.org/headers', config = my_config)
HTH

Resources