Client to use get function:
def get_hist_data(exchange_id, instrument_id, frequency, start_date, end_date):
headers = fill_header()
body =
{'exchange_id':exchange_id,'instrument_id':instrument_id,
'star``t_date':start_date,'end_date':end_date,'frequency':frequency}
resp = requests.get(stock_tick_url,json.dumps(body), headers=headers)
print resp.text
Sever side:
The result is none when to 'print args'. I know that if the client code change to this: ?xx=xxx&xxx=xxxx server will work. But change to json, it doesn't work.
Anyone can help me?
parser = reqparse.RequestParser()
parser.add_argument('exchange_id', type=str, location='args')
parser.add_argument('instrument_id', type=str, location='args')
parser.add_argument('start_date', type=str, location='args')
parser.add_argument('end_date', type=str, location='args')
parser.add_argument('frequency', type=int, location='args')
def get(self):
args = parser.parse_args()
Related
I want to go to all the pages of the yelp site but cann't
this is the code
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import os
import json
import datetime
import csv
# property scraper class
class Yelp(scrapy.Spider):
# scraper name
name = 'home business'
base_url = 'https://www.yelp.com/search?'
params = {
'find_desc': 'Home Cleaning',
'find_loc':'North Dallas, Dallas, TX',
#'start' : ''
}
page = 0
current_page = 1
# headers
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
}
#params['start'] = page
try:
os.remove('abx.csv')
except OSError:
pass
# custom settings
custom_settings = {
'CONCURRENT_REQUEST_PER_DOMAIN': 2,
'DOWNLOAD_DELAY': 1
}
# general crawler
def start_requests(self):
url = self.base_url + urllib.parse.urlencode(self.params)
# initial HTTP request
yield scrapy.Request(
url=url,
headers=self.headers,
callback=self.parse_listing
)
def parse_listing(self, response):
lists = response.css('h4[class="css-1l5lt1i"]')
for link in lists:
link = link.css('a::attr(href)').get()
link = 'https://www.yelp.com/' + link
#print('\n\nlink:',link,'\n\n')
yield response.follow(link, headers = self.headers, callback = self.parse_cards)
break
try:
#self.params['start'] = self.page
try:
total_pages = response.css('.text-align--center__09f24__1P1jK .css-e81eai::text').get()[5:7]
print(total_pages)
self.page +=10
self.current_page +=1
except Exception as e:
total_pages = 1
print('totl:',total_pages)
print('PAGE %s | %s ' % (self.current_page, total_pages))
if int(self.page/10) <= int(total_pages):
self.log('\n\n %s | %s\n\n ' %(self.page/10, total_pages))
next_page = response.url + '&start=' + str(self.page)
yield response.follow(url = next_page, headers = self.headers, callback = self.parse_listing)
except:
print('only single page',self.current_page)
def parse_cards(self,response):
print('\nok\n')
# main driver
if __name__ == '__main__':
# run scraper
process = CrawlerProcess()
process.crawl(Yelp)
process.start()
#Yelp.parse_cards(Yelp, '')
I applied try and except method also but cann't done the job.
The main problem is in the next page with the param '&start=' if i increment the start to 10 in every time then the url become every time like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=10&start=20&start=30'
and so on i want to only the url start will increment to start=10 and after them start=20 and so on.
like this
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=20'
'https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX&start=30'
and so on.
Just find the link to the next page and follow that
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
This is pretty similar to what is done in the scrapy tutorial, have you followed that? Was there a reason you couldn't do it this way?
In the end your entire spider can become
from scrapy import Spider
class Yelp(Spider):
# scraper name
name = "home business"
start_urls = [
"https://www.yelp.com/search?find_desc=Home+Cleaning&find_loc=North+Dallas%2C+Dallas%2C+TX"
]
def parse(self, response):
for link in response.css("h4 > span > a"):
yield response.follow(link, callback=self.parse_cards)
next_page = response.css("a.next-link::attr(href)").get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_cards(self, response):
print("parse_cards", response.url)
I removed the start_requests stuff to keep it simple for this example (something you should probably try to do when asking questions)
I am fairly new to airflow and I am currently trying to pass information between my SimpleHttpOperators.
This is where the data is retrieved:
request_city_information = SimpleHttpOperator(
http_conn_id='overpass',
task_id='basic_city_information',
headers={"Content-Type": "application/x-www-form-urlencoded"},
method='POST',
data=f'[out:json]; node[name={name_city}][capital]; out center;',
response_filter=lambda response: response.json()['elements'][0],
dag=dag,)
And then I want to use the response from this in the following operator:
request_city_attractions = SimpleHttpOperator(
http_conn_id='overpass',
task_id='city_attractions',
headers={"Content-Type": "application/x-www-form-urlencoded"},
method='POST',
data=f"[out:json];(nwr[tourism='attraction'][wikidata](around:{search_radius},"
f"{request_city_information.xcom_pull(context='ti')['lat']}"
f",10););out body;>;out skel qt;",
dag=dag)
As you can see I tried to access the response via request_city_information.xcom_pull(context='ti'). However, my context seems to be wrong here.
As my data is already written into the XComs I take it that I don't need XCOM_push='True', as suggested here.
There seem to be changes to XCom since airflow 2.x as many of the suggested solutions I found do not work for me.
I believe there is a major gap in my thought process, I just don't know where.
I would appreciate any references to examples or help!
Thanks in advance
I have now solved it with a completely different approach, if you guys know how the first one works I would be happy for an explanation on that.
Here is my solution:
with DAG(
'city_info',
default_args=dafault_args,
description='xcom test',
schedule_interval=None,
) as dag:
#TODO: Tasks with conn_id
def get_city_information(**kwargs):
payload = f'[out:json]; node[name={name_city}][capital]; out center;'
#TODO: Request als Connection
r = requests.post('https://overpass-api.de/api/interpreter', data=payload)
ti = kwargs['ti']
ti.xcom_push('basic_city_information', r.json())
get_city_information_task = PythonOperator(
task_id='get_city_information_task',
python_callable=get_city_information
)
def get_city_attractions(**kwargs):
ti = kwargs['ti']
city_information = ti.xcom_pull(task_ids='get_city_information_task', key='basic_city_information')
payload = f"[out:json];(nwr[tourism='attraction'][wikidata](around:{search_radius}" \
f",{city_information['elements'][0]['lat']},{city_information['elements'][0]['lon']}" \
f"););out body;>;out skel qt;"
r = requests.post('https://overpass-api.de/api/interpreter', data=payload)
#TODO: Json as Object
ti.xcom_push('city_attractions', r.json())
get_city_attractions_task = PythonOperator(
task_id='get_city_attractions_task',
python_callable=get_city_attractions
)
get_city_information_task >> get_city_attractions_task
I'm trying to get some Json data from a Jira server using Haskell. I'm counting this as "me having problems with Haskell" rather than encodings or Jira because my problem is when doing this in Haskell.
The problem occurs when the URL (or query) has plus signs. After building my request for theproject+order+by+created, Haskell prints it as:
Request {
host = "myjiraserver.com"
port = 443
secure = True
requestHeaders = [("Content-Type","application/json"),("Authorization","<REDACTED>")]
path = "/jira/rest/api/2/search"
queryString = "?jql=project%3Dtheproject%2Border%2Bby%2Bcreated"
method = "GET"
proxy = Nothing
rawBody = False
redirectCount = 10
responseTimeout = ResponseTimeoutDefault
requestVersion = HTTP/1.1
}
But the request fails with this response:
- 'Error in the JQL Query: The character ''+'' is a reserved JQL character. You must
enclose it in a string or use the escape ''\u002b'' instead. (line 1, character
21)'
So it seems like Jira didn't like Haskell's %2B. Do you have any suggestions on what I can do to fix this, or any resources that might be helpful? The same request sans the +order+by+created part is successful.
The code (patched together from these examples):
{-# LANGUAGE OverloadedStrings #-}
import Data.Aeson
import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple
import System.Environment (getArgs)
-- auth' is echo -e "username:passwd" | base64
foo urlBase proj' auth' = do
let proj = S8.pack (proj' ++ "+order+by+created")
auth = S8.pack auth'
request'' <- parseRequest urlBase
let request'
= setRequestMethod "GET"
$ setRequestPath "/jira/rest/api/2/search"
$ setRequestHeader "Content-Type" ["application/json"]
$ request''
request
= setRequestQueryString [("jql", Just (S8.append "project=" proj))]
$ setRequestHeader "Authorization" [S8.append "Basic " auth]
$ request'
return request
main :: IO ()
main = do
args <- getArgs
case args of
(urlBase:proj:auth:_) -> do
request <- foo urlBase proj auth
putStrLn $ show request
response <- httpJSON request
S8.putStrLn $ Yaml.encode (getResponseBody response :: Value) -- apparently this is required
putStrLn ""
_ -> putStrLn "usage..."
(If you know a simpler way to do the above then I'd take such suggestions as well, I'm just trying to do something analogous to this Python:
import requests
import sys
if len(sys.argv) >= 4:
urlBase = sys.argv[1]
proj = sys.argv[2]
auth = sys.argv[3]
urlBase += "/jira/rest/api/2/search?jql=project="
proj += "+order+by+created"
h = {}
h["content-type"] = "application/json"
h["authorization"] = "Basic " + auth
r = requests.get(urlBase + proj, headers=h)
print(r.json())
)
project+order+by+created is the URL-encoded string for the actual request project order by created (with spaces instead of +). The function setRequestQueryString expects a raw request (with spaces, not URL-encoded), and URL-encodes it.
The Python script you give for comparison essentially does the URL-encoding by hand.
So the fix is to put the raw request in proj:
foo urlBase proj' auth' = do
let proj = S8.pack (proj' ++ " order by created") -- spaces instead of +
...
I want to send some data in POST request using Tornado (AsyncHTTPClient)
rec_body = {'source': self.request.body, 'top': str(self.config["top"]), 'model': self.config["model"]}
where self.request.body is a raw binary file (image).
I try doing this:
http_client = AsyncHTTPClient()
rec_body = {'source': self.request.body, 'top': str(self.config["top"]), 'model': self.config["model"]}
request = HTTPRequest( url = os.path.join(self.config["dest_addr"], self.config["sub_sect"]) , method='POST', body =rec_body)
result = http_client.fetch( request, callback=self.handle_request)
but got this errors
File "/usr/local/lib/python2.7/dist-packages/tornado/httpclient.py", line 424, in __init__
self.body = body
File "/usr/local/lib/python2.7/dist-packages/tornado/httpclient.py", line 468, in body
self._body = utf8(value)
File "/usr/local/lib/python2.7/dist-packages/tornado/escape.py", line 203, in utf8
"Expected bytes, unicode, or None; got %r" % type(value)
TypeError: Expected bytes, unicode, or None; got <type 'dict'>
ERROR:tornado.access:500 POST /upload (192.168.72.84) 13.14ms
What I doing wrong?
I tried curl (naively), requests module all works fine, ut not asynchronously. For tornado`s AsyncHTTPClient there is good recipe from flickr.
Deals with multipart POST requests.
The code is adapted from the recipe found at :
http://code.activestate.com/recipes/146306/
No author name was given.
Author : Alexis Mignon (c)
email : alexis.mignon#gmail.Com
Date : 06/08/2011
Here is the code:
import mimetypes
from tornado.gen import coroutine, Return
from tornado.httpclient import HTTPRequest
from tornado_flickrapi.httpclient import fetch
#coroutine
def posturl(url, fields, files):
try:
response = yield post_multipart(url, fields, files)
except Exception as e:
raise e
raise Return(response)
#coroutine
def post_multipart(url, fields, files):
"""
Post fields and files to an http host as multipart/form-data.
fields is a sequence of (name, value) elements for regular form fields.
files is a sequence of (name, filename, value) elements for data to be
uploaded as files.
Return the server's response page.
"""
content_type, body = encode_multipart_formdata(fields, files)
headers = {"Content-Type": content_type, 'content-length': str(len(body))}
request = HTTPRequest(url, "POST", headers=headers, body=body, validate_cert=False)
try:
response = yield fetch(request)
except Exception as e:
raise e
raise Return(response)
def encode_multipart_formdata(fields, files):
"""
fields is a sequence of (name, value) elements for regular form fields.
files is a sequence of (name, filename, value) elements for data to be
uploaded as files.
Return (content_type, body) ready for httplib.HTTP instance
"""
BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
CRLF = '\r\n'
L = []
for (key, value) in fields:
L.append('--' + BOUNDARY)
L.append('Content-Disposition: form-data; name="%s"' % key)
L.append('')
L.append(value)
for (key, filename, value) in files:
filename = filename.encode("utf8")
L.append('--' + BOUNDARY)
L.append(
'Content-Disposition: form-data; name="%s"; filename="%s"' % (
key, filename
)
)
L.append('Content-Type: %s' % get_content_type(filename))
L.append('')
L.append(value)
L.append('--' + BOUNDARY + '--')
L.append('')
body = CRLF.join(L)
content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
return content_type, body
def get_content_type(filename):
return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
I have code like this:
def delete(self, rid):
parser = reqparse.RequestParser()
parser.add_argument('rating', default=2, type=int, help='blablabla')
args = parser.parse_args()
rating = args['rating']
...
return {'message': message}
This still asks me for the rating param, and throws 400 Bad Request if no exist.
Did I miss something?
Try required=False:
parser.add_argument('rating', default=2, required=False, type=int, help='blablabla')
and check for rating in args (if 'rating' in args: pass).