Thanks in advance for your help.
I'm currently running a web scraper - this is the first time I've ever done something like this. It pulls addresses from the URL and then matches each address against the user's input. This will be going into a chat bot, and I'm wondering how I can make it run on Google Cloud Functions. What's the process to do this - is there a tutorial anywhere?
This is my code so far. There is a small items file too.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

##    def start_requests(self):
##        self.first = input("Please enter the address you would like to match: ")
##        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None

        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)

            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio

            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]

        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)

    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()

        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried googling and having a look around, but I can't work out how to deploy this as a project to run on Google Cloud Functions - or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def my_cloud_function(event, context):

    def script(queue):
        try:
            settings = get_project_settings()

            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })

            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes

    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result

    return 'ok'
You can refer to this tutorial for more info.
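On the deployment side of the question, here is one hedged sketch (not from the answer above; the function name, runtime and memory size are placeholders): put the handler in main.py next to a requirements.txt that lists scrapy, fuzzywuzzy and your spider package, then deploy with the gcloud CLI:
gcloud functions deploy scrape-addresses \
    --runtime python310 \
    --trigger-http \
    --entry-point my_cloud_function \
    --memory 512MB
Cloud Functions then calls my_cloud_function on each HTTP request, which is where the child-process wrapper above runs the spider.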
The code below creates the DAG (the graph is also attached), which contains two PythonSensors and a PythonOperator.
The first sensor creates a random integer list as data and a random boolean with a 50% chance of success. It logs the generated values and returns a PokeReturnValue.
The second sensor and the PythonOperator both try to get the data from XCom and log it.
Graph of DAG
# region IMPORTS
import random
import logging
from datetime import datetime, timedelta
from airflow import DAG
from heliocampus.configuration.constants import Constants
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue
from airflow.utils.trigger_rule import TriggerRule
from box import Box
# endregion

# region configuration
constants = Constants()
dagconfig = Box({"Code": "Test"})
# endregion


def main() -> DAG:
    # region default_args
    args = dict()
    args['start_date'] = datetime(2021, 1, 1)
    # endregion

    with DAG(dag_id=dagconfig.Code, schedule_interval="@once", default_args=args, tags=['test', 'V0.1.4']) as dag:
        start = EmptyOperator(task_id="start")

        # region Sensors
        check_all_expired_tables = PythonSensor(
            task_id="CHECK_ALL_EXPIRED_TABLES",
            poke_interval=timedelta(seconds=20).total_seconds(),
            timeout=timedelta(minutes=1).total_seconds(),
            mode="reschedule",
            python_callable=check_expired_tables,
            trigger_rule=TriggerRule.ALL_SUCCESS
        )

        check_all_expired_tables_notification = PythonOperator(
            task_id="CHECK_ALL_EXPIRED_TABLES_NOTIFICATION",
            python_callable=sensor_result_nofitication,
            op_kwargs={"notification_source": "CHECK_ALL_EXPIRED_TABLES"},
            trigger_rule=TriggerRule.ALL_FAILED
        )

        verify_ods_operator = PythonSensor(
            task_id="VERIFY_ODS",
            poke_interval=timedelta(seconds=30).total_seconds(),
            timeout=timedelta(hours=2).total_seconds(),
            mode="reschedule",
            python_callable=verify_ods,
            op_kwargs={"notification_source": "CHECK_ALL_EXPIRED_TABLES"},
            trigger_rule=TriggerRule.ALL_SUCCESS
        )
        # endregion

        end = EmptyOperator(task_id="end")

        start >> check_all_expired_tables >> verify_ods_operator >> end
        check_all_expired_tables >> check_all_expired_tables_notification

    return dag


# region Notifications
def sensor_result_nofitication(ti, notification_source):
    actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
    logging.info(f"sensor_result_nofitication : Sensor without key from {notification_source} is {actual_xcom_value}")
    actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
    logging.info(f"sensor_result_nofitication : Sensor return_value from {notification_source} is {actual_xcom_value}")
# endregion


def check_expired_tables():
    randomlist = random.sample(range(10, 30), 5)
    randomResult = random.randint(0, 100) > 50
    logging.info(f"check_expired_tables : returning PokeReturnValue(is_done={randomResult}, xcom_value={randomlist})")
    return PokeReturnValue(is_done=randomResult, xcom_value=randomlist)


def verify_ods(ti, notification_source):
    actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
    logging.info(f"verify_ods : Sensor without key from {notification_source} is {actual_xcom_value}")
    actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
    logging.info(f"verify_ods : Sensor return_value from {notification_source} is {actual_xcom_value}")
    rnd = random.randint(0, 100)
    logging.info("Random Number : {num}".format(num=rnd))
    return (rnd > 20)


main()
Regardless of whether the first sensor is successful or not, the data from XCom cannot be logged in the second sensor or the PythonOperator.
I don't know if the problem is on the pushing side or the pulling side.
I cannot see any rows inserted into the Airflow database (xcom table).
The problem lives in the PythonSensor, which coerces the return value of the Python callable to a boolean without checking its type first:
return_value = self.python_callable(*self.op_args, **self.op_kwargs)
return PokeReturnValue(bool(return_value))
To get the expected behavior, something like this needs to be added to the PythonSensor:
return return_value if isinstance(return_value, PokeReturnValue) else PokeReturnValue(bool(return_value))
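Until something like that exists in the sensor itself, one possible workaround is a small subclass; this is only a sketch (the class name is made up, and it skips the context/kwargs merging the stock sensor performs, which is fine here because check_expired_tables takes no arguments):
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue

class PassthroughPythonSensor(PythonSensor):
    # Illustrative subclass: return the callable's PokeReturnValue as-is so its
    # xcom_value gets pushed, instead of coercing the whole object to a boolean.
    def poke(self, context):
        return_value = self.python_callable(*self.op_args, **self.op_kwargs)
        if isinstance(return_value, PokeReturnValue):
            return return_value
        return PokeReturnValue(bool(return_value))
check_all_expired_tables could then be declared with PassthroughPythonSensor instead of PythonSensor, leaving the callables unchanged.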
import requests
import telegram
import json
from telegram.ext import Updater, CommandHandler
import time
import sys
import pandas as pd
from apscheduler.schedulers.background import BlockingScheduler
from apscheduler.jobstores.base import JobLookupError
dt = requests.get('https://min-api.cryptocompare.com/data/price?fsym=BTC&tsyms=USD,EUR')
print(dt.text)
price_now = dt.json()
bot_token = "5668522544:AAFqNFcgd5wDBtQbJBhRayfPx9VpVPVjcyQ"
Cointimeline = telegram.Bot(token=bot_token)
updates = Cointimeline.getUpdates()
for i in updates:
    print(i.message)


class Chatbot:
    def __init__(self, token):
        self.core = telegram.Bot(token)
        self.updater = Updater(token)
        self.id = 5734902861

    def sendmsg(self, text):
        self.core.send_message(chat_id=self.id, text=text)

    def stop(self):
        self.updater.stop()


class Alert(Chatbot):
    def __init__(self):
        self.token = '5668522544:AAFqNFcgd5wDBtQbJBhRayfPx9VpVPVjcyQ'
        Chatbot.__init__(self, self.token)
        self.updater.stop()

    def controller(self, cmd, func):
        self.updater.dispatcher.add_handler(CommandHandler(cmd, func))

    def start(self):
        self.sendmsg('')


aps = BlockingScheduler()

def push():
    dt = requests.get("https://min-api.cryptocompare.com/data/price?fsym=BTC&tsyms=USD,EUR")
    ALERTBOT = Alert()
    ALERTBOT.sendmsg(dt.text)
    price_now = pd.DataFrame({"USD": [list(dt.json().values())[0]], "EUR": [list(dt.json().values())[1]]})
    data = pd.read_csv("ALERTBOT.csv")
    data = data.append(price_now, sort=True)
    data = data.loc[:, 'USD':'EUR']
    data.to_csv("ALERTBOT.csv")

aps.add_job(push, 'interval', seconds=60)
aps.start()
The following warning continues to occur:
PytzUsageWarning: The zone attribute is specific to pytz's interface; please migrate to a new time zone provider. For more details on how to do so, see https://pytz-deprecation-shim.readthedocs.io/en/latest/migration.html
if obj.zone == 'local':
The warning is caused by the pytz library being out of date. You can solve the problem by updating the library:
pip install --upgrade pytz
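If you want to confirm which version ends up installed after the upgrade, a quick check (just a sketch) is:
import pytz
print(pytz.__version__)  # should report a recent release after the upgrade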
I need to download multiple 10-K documents. This code works fine if I download the 10-Ks for between 5 and 10 companies, but it fails if I increase the number of companies in the cik_lookup dict. Here's the code.
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm
Here's the .py file that contains the project_helper functions.
import matplotlib.pyplot as plt
import requests
from ratelimit import limits, sleep_and_retry


class SecAPI(object):
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit
    @limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        return requests.get(url)

    def get(self, url):
        return self._call_sec(url).text


def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
    indentation = ' '

    print('[')
    for ten_k in ten_k_data:
        print_statement = '{}{{'.format(indentation)
        for field in fields:
            value = str(ten_k[field])

            # Show return lines in output
            if isinstance(value, str):
                value_str = '\'{}\''.format(value.replace('\n', '\\n'))
            else:
                value_str = str(value)

            # Cut off the string if it gets too long
            if len(value_str) > field_length_limit:
                value_str = value_str[:field_length_limit] + '...'

            print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)

        print_statement += '},'
        print(print_statement)
    print(']')
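As a quick illustration of how the wrapper above is meant to be used (the CIK in the URL is only an example): every get() goes through the rate-limited _call_sec, so a burst of requests is throttled to 5 calls per second and retried after sleeping once the limit is hit.
sec_api = SecAPI()

# Throttled by @limits/@sleep_and_retry above: at most 5 requests per second.
atom_xml = sec_api.get(
    'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'
    '&CIK=0000320193&type=10-K&owner=exclude&output=atom')
print(atom_xml[:300])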
The first step is to download the NLP corpora.
nltk.download('stopwords')
nltk.download('wordnet')
Then get the 10-Ks.
#cik_lookup = {
# 'GOOGL':'0001288776',
# 'AAPL':'0000320193',
# 'FACEBOOK':'0001326801',
# 'AMZN':'0001018724',
# 'MSFT':'0000789019'}
cik_lookup = {
'AEP': '0000004904',
'AXP': '0000004962',
'BA': '0000012927',
'BK': '0001390777',
'CAT': '0000018230',
'DE': '0000315189',
'DIS': '0001001039',
'DTE': '0000936340',
'ED': '0001047862',
'EMR': '0000032604',
'ETN': '0001551182',
'GE': '0000040545',
'IBM': '0000051143',
'IP': '0000051434',
'JNJ': '0000200406',
'KO': '0000021344',
'LLY': '0000059478',
'MCD': '0000063908',
'MO': '0000764180',
'MRK': '0000310158',
'MRO': '0000101778',
'PCG': '0001004980',
'PEP': '0000077476',
'PFE': '0000078003',
'PG': '0000080424',
'PNR': '0000077360',
'SYY': '0000096021',
'TXN': '0000097476',
'UTX': '0000101829',
'WFC': '0000072971',
'WMT': '0000104169',
'WY': '0000106535',
'XOM': '0000034088'}
Get the list of 10-Ks:
sec_api = project_helper.SecAPI()
from bs4 import BeautifulSoup
def get_sec_data(cik, doc_type, start=0, count=60):
    newest_pricing_data = pd.to_datetime('2021-01-01')
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
    sec_data = sec_api.get(rss_url)
    feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
    entries = [
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)
        if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]

    return entries


example_ticker = 'AEP'
sec_data = {}

for ticker, cik in cik_lookup.items():
    sec_data[ticker] = get_sec_data(cik, '10-K')
The code works fine if I download the 10-Ks for between 5 and 10 companies, but if I increase the number of companies in cik_lookup I get errors. The first error I got is below.
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
5 rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' .format(cik, doc_type, start, count)
6 sec_data = sec_api.get(rss_url)
----> 7 feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
8 entries = [
9 (
UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)
However, after some googling around BeautifulSoup encodings, I changed it to utf-8 and then got the following error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
11 entry.content.find('filing-type').getText(),
12 entry.content.find('filing-date').getText())
---> 13 for entry in feed.find_all('entry', recursive=False)
14 if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
15
AttributeError: 'NoneType' object has no attribute 'find_all'
The project can also be accessed at the following GitHub repo: github repo here.
I am using bokeh 0.12.9. I have a table and a figure which I replace in the global layout on callback. I usually build the ColumnDataSource right before I build the new figure/table. Now I wanted to try and see if I can have a global ColumnDataSource so that I can adjust the data via a CDSView (no need to replace table/figure then).
Unfortunately even keeping a separate CDS and view for table and plot fails. When clicking the radio button a couple of times I receive the following javascript error:
Uncaught TypeError: Cannot read property 'data' of undefined
from datetime import date
from random import randint
from bokeh.models import Line
import numpy as np
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
import bokeh.layouts as layouts
import bokeh.models.widgets as widgets
from bokeh.io import curdoc
from bokeh.models import CustomJS, Slider
from bokeh import palettes
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, CDSView, IndexFilter
from bokeh.models import widgets
def gen_plot(source=None, view=None):
    p = figure(title='test',
               x_axis_type="datetime",
               plot_width=600, plot_height=400)
    colors = palettes.Category10[10]
    cols = [str(col) for col in source.column_names]
    for ix, col in enumerate(cols):
        if col == 'index':
            continue
        r = p.line(x='index', y=col, source=source, view=view,
                   legend='_' + col,
                   color=colors[ix])
    p.legend.location = "bottom_left"
    return p


def gen_table(source=None, view=None):
    columns = [TableColumn(field=ele, title=ele) for ele
               in source.column_names]
    tab = widgets.DataTable(source=source, view=view, columns=columns,
                            selectable=False,
                            reorderable=False,
                            width=600, height=400)
    return tab


def update(attr, old, new):
    p = gen_plot(source=cdss[0], view=vs[0])
    t = gen_table(source=cdss[1], view=vs[1])
    print(l.children)
    l.children[1] = p
    l.children[2].children[0] = t
# set up data
cols = ['col1', 'col2', 'col3', 'col4']
df1 = pd.DataFrame(pd.util.testing.getTimeSeriesData())
df1.columns = cols
df2 = pd.DataFrame(pd.util.testing.getTimeSeriesData())
df2.columns = cols
dfs = [df1, df2]
cds1 = ColumnDataSource(df1)
cds2 = ColumnDataSource(df2)
cdss = [cds1, cds2]
filters = [IndexFilter([0, 1, 2, 4])]
filters = []
v1 = CDSView(source=cds1, filters=filters)
v2 = CDSView(source=cds2, filters=filters)
vs = [v1, v2]
# initialize items to replace
p = gen_plot(source=cdss[0], view=vs[0])
t = gen_table(source=cdss[1], view=vs[1])
# initialize controls
radio_wghting = widgets.RadioButtonGroup(labels=["Equal", "Exponential"],
active=0)
radio_wghting.on_change('active', update)
# set up layout
sizing_mode = 'fixed'
l = layout([radio_wghting, p, t], sizing_mode=sizing_mode)
curdoc().add_root(l)
curdoc().title = 'blub'
# call callback initially
update('value', 0, 0)
Any hints are much appreciated!
Now I wanted to try and see if I can have a global ColumnDataSource so that I can adjust the data via a CDSView (no need to replace table/figure then).
The code you are showing is the one in which you are trying to replace the figure and table.
When you replace the child of a layout object in that way, you are not actually removing the previous figures from curdoc, and other elements in the document still have the old figures and tables in their references.
You could try something like this to update the sources directly:
for rend in p.renderers:
    try:
        rend.data_source
    except AttributeError:
        pass
    else:
        rend.data_source.data.update(new_data_dictionary)
and
t.source.data.update(new_data_dictionary)
EDIT: to answer the comment
from bokeh.io import curdoc
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Button
from bokeh.layouts import gridplot, widgetbox
from random import random, choice
import numpy as np

my_data = {1: {'x': [], 'y': [], 'colo': [], 'size': []}}

kelly_colors = ['#F3C300', '#875692', '#F38400', '#A1CAF1', '#BE0032', '#C2B280', '#848482', '#008856', '#E68FAC', '#0067A5',
                '#F99379', '#604E97', '#F6A600', '#B3446C', '#DCD300', '#882D17', '#8DB600', '#654522', '#E25822', '#2B3D26', ]

x = np.arange(0, 50, 0.1)


def rand_dict():
    rand_x = [choice(x) for i in range(7)]
    return {'x': rand_x,
            'y': np.array([random() * 100 for i in rand_x]),
            'colo': np.array([choice(kelly_colors) for i in rand_x]),
            'size': np.array([(5 + int(random() * 50)) for i in rand_x])}


def add_stuff():
    global my_data
    my_data[max(my_data.keys()) + 1] = rand_dict()
    make_doc()


def change_stuff():
    global my_data
    myfig = curdoc().select_one({"name": "myfig"})
    for i, rend in enumerate(myfig.renderers):
        try:
            rend.data_source
        except AttributeError:
            pass
        else:
            my_data[i + 1] = rand_dict()
            rend.data_source.data.update(my_data[i + 1])


def clear_stuff():
    global my_data
    my_data = {1: {'x': [], 'y': [], 'colo': [], 'size': []}}
    make_doc()


def make_doc():
    curdoc().clear()
    myfig = figure(plot_width=1000, plot_height=800, outline_line_alpha=0, name='myfig')
    myfig.x_range.start = -5
    myfig.x_range.end = 55
    myfig.y_range.start = -10
    myfig.y_range.end = 110
    myfig.renderers = []

    add_button = Button(label='add stuff', width=100)
    change_button = Button(label='change stuff', width=100)
    clear_button = Button(label='clear stuff', width=100)
    add_button.on_click(add_stuff)
    change_button.on_click(change_stuff)
    clear_button.on_click(clear_stuff)

    grid = gridplot([[myfig, widgetbox(add_button, change_button, clear_button)]], toolbar_location=None)
    curdoc().add_root(grid)
    update_doc()


def update_doc():
    myfig = curdoc().select_one({"name": "myfig"})
    for key in my_data:
        myfig.scatter(x='x', y='y', color='colo', size='size', source=ColumnDataSource(data=my_data[key]))
    curdoc().title = 'mytitle'


make_doc()
What I like about doing this is that you can just save the my_data dictionary with numpy, load it later, and keep changing your plots from there.
def load_data():
    global my_data
    my_data = np.load(path_to_saved_data).item()
    make_doc()
You can probably do something similar using pandas DataFrames; I am just more comfortable with plain dictionaries.
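For completeness, a hedged sketch of that pandas variant (not part of the original answer; the helper name and pickle path are made up): keep one DataFrame per renderer, persist the whole dict with to_pickle, and convert back to plain column dicts before handing them to the ColumnDataSources.
import pandas as pd

def load_data_pandas(path_to_saved_frames):
    # Assumed counterpart of load_data(): path_to_saved_frames points at a
    # pickled dict of DataFrames written earlier with pd.to_pickle(...).
    global my_data
    frames = pd.read_pickle(path_to_saved_frames)
    my_data = {key: df.to_dict(orient='list') for key, df in frames.items()}
    make_doc()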