I'm trying to simulate reading data from a queue such as Kafka. I need to maintain a pointer to the current record while I stream data from a file. Currently I'm doing this with a global variable, but that appears to be shared across all user sessions.
How can I save this per-user state in Bokeh?
def modify_doc(doc):
    df_all = pd.read_csv('data.csv')
    df_all['Date'] = pd.to_datetime(df_all['Date'])
    start_data = df_all[0:10].to_dict(orient='list')
    source = ColumnDataSource(data=start_data)
    ...

    def callback():
        # FIXME: how can we save the current_record in the user's session?
        global current_record
        try:
            current_record
        except NameError:
            current_record = 10
        df = df_all[current_record:current_record+1]
        if df.shape[0] > 0:
            # we have another record so display it
            new_data = df.to_dict(orient='list')
            source.stream(new_data)
            current_record = current_record + 1

    doc.add_root(plot)
    doc.add_periodic_callback(callback, 250)
I've seen the documentation for ClientSession, but this appears to work at the whole document level?
I've included a Minimal, Complete and Verifiable example below:
file: bokeh_server.py
Run locally with: python3 bokeh_server.py
import pandas as pd
from tornado.ioloop import IOLoop
import yaml
from jinja2 import Template
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Slider, Div
from bokeh.plotting import figure
from bokeh.server.server import Server
from bokeh.themes import Theme
from bokeh.client import push_session
import os
# if running locally, listen on port 5000
PORT = int(os.getenv('PORT', '5000'))
HOST = "0.0.0.0"
try:
    # This is set in the cloud foundry manifest. If we are running on
    # cloud foundry, this will be set for us.
    ALLOW_WEBSOCKET_ORIGIN = os.getenv("ALLOW_WEBSOCKET_ORIGIN").split(',')
except:
    # We are not running on cloud foundry so we must be running locally
    ALLOW_WEBSOCKET_ORIGIN = ['localhost:{0}'.format(PORT)]
io_loop = IOLoop.current()
# This example simulates reading from a stream such as kafka
def modify_doc(doc):
    df_all = pd.read_csv('data.csv')
    df_all['Date'] = pd.to_datetime(df_all['Date'])
    start_data = df_all[0:10].to_dict(orient='list')
    source = ColumnDataSource(data=start_data)

    plot = figure(x_axis_type='datetime',
                  y_range=(0, 10000000),
                  y_axis_label='Y Label',
                  title="Title")
    plot.line('Date', 'ALL_EXCL_FUEL', color='blue', alpha=1, source=source)
    plot.line('Date', 'MOSTLY_FOOD', color='lightblue', alpha=1, source=source)
    plot.line('Date', 'NON_SPECIALISED', color='grey', alpha=1, source=source)

    def callback():
        # FIXME: how can we save this in the user's session?
        global counter
        try:
            counter
        except NameError:
            counter = 10
        df = df_all[counter:counter+1]
        if df.shape[0] > 0:
            # hardcode update values for now
            new_data = df.to_dict(orient='list')
            source.stream(new_data)
            counter = counter + 1

    doc.add_root(plot)
    doc.add_periodic_callback(callback, 250)
bokeh_app = Application(FunctionHandler(modify_doc))
server = Server(
{'/': bokeh_app},
io_loop=io_loop,
allow_websocket_origin=ALLOW_WEBSOCKET_ORIGIN,
**{'port': PORT, 'address': HOST}
)
server.start()
if __name__ == '__main__':
    io_loop.add_callback(server.show, "/")
    io_loop.start()
file: data.csv
Date,ALL_EXCL_FUEL,MOSTLY_FOOD,NON_SPECIALISED,TEXTILE,HOUSEHOLD,OTHER,NON_STORE
1986 Jan,1883154,747432,163708,267774,261453,281699,161088
1986 Feb,1819796,773161,152656,223836,246502,275121,148519
1986 Mar,1912582,797104,169440,251438,249614,292348,152638
1986 Apr,1974419,809334,170540,275975,260086,299271,159213
1986 May,1948915,800193,170173,274979,251175,297655,154740
1986 Jun,2019114,821785,178366,295463,251507,311447,160546
1986 Jul,2051539,816033,184812,297969,269786,323187,159752
1986 Aug,2011746,804386,180911,297138,263427,310220,155665
1986 Sep,2046678,792943,181055,305350,280640,318368,168322
1986 Oct,2110669,810147,187728,308919,298637,325617,179621
1986 Nov,2315710,847794,231599,352009,332079,358077,194152
1986 Dec,2830206,970987,319570,490001,373714,469399,206536
1987 Jan,2032021,798562,172215,288186,288534,307900,176624
1987 Feb,1980748,805713,165682,247219,282836,313577,165721
1987 Mar,2009717,816051,174034,256756,280207,315562,167106
1987 Apr,2156967,862749,189729,308543,284440,336755,174751
1987 May,2075808,834375,175464,287515,280404,330093,167957
1987 Jun,2137092,844051,183014,304706,286522,345149,173651
1987 Jul,2208377,847098,198848,330804,301537,356037,174054
I performed some testing and found that every time a new browser session was opened with the Bokeh chart URL, a new Bokeh Document instance was created. The answer for me was to save the state on the document instance:
def modify_doc(doc):
    # The first 100 records of data.csv will be loaded immediately.
    # The remaining records will be read one-by-one in the update
    # callback, which is used to simulate new, realtime data arriving.
    doc.realtime_rec_ptr = 100

    df_all = pd.read_csv('data.csv')
    df_all['Date'] = pd.to_datetime(df_all['Date'])
    start_data_df = df_all[0:doc.realtime_rec_ptr]
    start_data_df.loc[:, 'color'] = 'green'
    src = ColumnDataSource(data=start_data_df.to_dict(orient='list'))

    p = figure(x_axis_type='datetime', title="Title",
               y_range=(0, 10000000), y_axis_label='Y Label')
    p.line('Date', 'ALL_EXCL_FUEL', color='blue', alpha=1, source=src)
    # the initial markers are colored green; streamed (realtime) markers are colored blue
    p.circle('Date', 'ALL_EXCL_FUEL', color='color', fill_alpha=0.2, size=4, source=src)

    def callback():
        df = df_all[doc.realtime_rec_ptr:doc.realtime_rec_ptr + 1]
        if df.shape[0] > 0:
            df.loc[:, 'color'] = 'blue'
            new_data = df.to_dict(orient='list')
            #print(new_data)
            src.stream(new_data)
            doc.realtime_rec_ptr = doc.realtime_rec_ptr + 1

    doc.add_root(p)
    doc.add_periodic_callback(callback, 250)
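Since the app function runs once per browser session, another option is to keep the pointer as a plain local variable in modify_doc and rebind it from the callback with nonlocal; the closure is per-session, so it is not shared between users. A minimal sketch, using the same data.csv and columns as above:
def modify_doc(doc):
    df_all = pd.read_csv('data.csv')
    df_all['Date'] = pd.to_datetime(df_all['Date'])

    # Per-session state: modify_doc is called once for each new session,
    # so this variable is not shared between users.
    realtime_rec_ptr = 100

    src = ColumnDataSource(data=df_all[0:realtime_rec_ptr].to_dict(orient='list'))
    p = figure(x_axis_type='datetime', title="Title",
               y_range=(0, 10000000), y_axis_label='Y Label')
    p.line('Date', 'ALL_EXCL_FUEL', color='blue', source=src)

    def callback():
        nonlocal realtime_rec_ptr  # rebind the enclosing, per-session variable
        df = df_all[realtime_rec_ptr:realtime_rec_ptr + 1]
        if df.shape[0] > 0:
            src.stream(df.to_dict(orient='list'))
            realtime_rec_ptr += 1

    doc.add_root(p)
    doc.add_periodic_callback(callback, 250)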
I'm currently running a web scraper (this is the first time I've ever done something like this). It pulls addresses from the URL and then matches an address to the user's input. This will be going into a chatbot, and I'm wondering how I can make this run on Google Cloud Functions. What's the process to do this, and is there a tutorial anywhere?
This is my code so far. There is a small items file too.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None

        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)

            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio

            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]

        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)

    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()

        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried Googling and having a look around, but I can't work out how to deploy this as a project to run on Google Cloud Functions, or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def my_cloud_function(event, context):
    def script(queue):
        try:
            settings = get_project_settings()
            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })

            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes

    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result

    return 'ok'
You can refer to this tutorial for more info.
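One caveat worth noting: your spider prompts with input(), and a Cloud Function has no interactive stdin. A minimal sketch of one way around that, assuming the values come from the function's request payload, is to pass them as spider arguments; Scrapy forwards keyword arguments given to process.crawl to the spider's constructor, where the default Spider.__init__ turns them into instance attributes. The attribute name address_to_match below is illustrative, not part of the original project.
# Sketch only: pass the user's input as spider arguments instead of calling input().
# "address_to_match" is an illustrative attribute name, not from the original spider.
process = CrawlerProcess(get_project_settings())
process.crawl(
    AddressesSpider,
    postcode="bh10+4ah",              # e.g. taken from the function's request payload
    address_to_match="1 example road",
)
process.start()
Inside the spider, self.address_to_match would then replace the input() call, and start_urls would be built from self.postcode in __init__ (or start_requests) rather than at class level.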
from jupyter_plotly_dash import JupyterDash
import dash
import dash_leaflet as dl
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import dash_table
from dash.dependencies import Input, Output
import base64
import os
import numpy as np
import pandas as pd
from pymongo import MongoClient
from Module import AnimalShelter
username = "username"
password = "password"
animal = AnimalShelter(username, password)
df = pd.DataFrame.from_records(animal.readAll({}))
#########################
# Dashboard Layout / View
#########################
app = JupyterDash('Dash DataTable Only')
image_filename = 'Grazioso Salvare Logo.png' # customer image
encoded_image = base64.b64encode(open(image_filename, 'rb').read())
app.layout = html.Div([
html.Center(html.Img(src='data:image/png;base64,{}'.format(encoded_image.decode()))),
html.Center(html.B(html.H1('Kristopher Collis'))),
html.Hr(),
html.Div(
#Radio Items to select the rescue filter options
dcc.RadioItems(
id='filter-type',
),
),
html.Hr(),
dash_table.DataTable(
id='datatable-id',
columns=[
{"name": i, "id": i, "deletable": False, "selectable": True} for i in df.columns
],
data=df.to_dict('records'),
editable=False,
filter_action="native",
sort_action="native",
sort_mode="multi",
column_selectable=False,
row_selectable="multi",
row_deletable=False,
selected_columns=[],
selected_rows=[],
page_action="native",
page_current= 0,
page_size= 10,
),
html.Hr(),
html.Div(className='row',
style={'display' : 'flex'},
children =[
html.Div(
id='graph-id',
className='col s12 m6'
),
html.Div(
id = 'map-id',
className='col s12 m6'
),
]
),
html.Br(),
html.Hr(),
])
#############################################
# Interaction Between Components / Controller
#############################################
# callback for Piechart
@app.callback(
    Output('graph-id', "children"),
    [Input('datatable-id', "derived_viewport_data")])
# function for update_graph
def update_graphs(viewData):
    dff = pd.DataFrame.from_dict(viewData)
    names = dff['breed'].value_counts().keys().tolist()
    values = dff['breed'].value_counts().tolist()
    return [
        dcc.Graph(
            id='graph-id',
            fig=px.pie(data_frame=dff, values=values, names=names,
                       color_discrete_sequence=px.colors.sequential.RdBu,
                       width=800, height=500)
        )
    ]
# callback for update_map
@app.callback(
    Output('map-id', "children"),
    [Input('datatable-id', "derived_viewport_data"),
     Input('datatable-id', 'selected_rows'),
     Input('datatable-id', 'selected_columns')])
# update function with variables
def update_map(viewData, selected_rows, selected_columns):
    dff = pd.DataFrame.from_dict(viewData)
    # width, height of map, center of map, and how much zoom do you want for map
    return [
        dl.Map(style={'width': '1000px', 'height': '500px'}, center=[30.75, -97.48], zoom=7,
               children=[
                   dl.TileLayer(id="base-layer-id"),
                   # marker with tooltip and popup
                   dl.Marker(position=[dff.iloc[selected_rows[0], 13], dff.iloc[selected_rows[0], 14]],
                             children=[
                                 dl.Tooltip(dff.iloc[selected_rows[0], 4]),
                                 dl.Popup([
                                     html.H4("Animal Name"),
                                     html.P(dff.iloc[selected_rows[0], 9]),
                                 ])
                             ])
               ])
    ]
app
When I run the program, the geolocation map populates but the graph does not. I was able to populate the graph a couple of times while looking through Plotly and other documentation, but I have spent a while trying to figure out why the graph will not display again. I did attempt to use fig.show() at the bottom of the update_graphs function; I am not sure whether that was what made it work, but I am stumped. I am respectfully requesting help finding the error in the update_graphs function.
The code below creates the DAG (the graph is also attached), which contains two PythonSensors and a PythonOperator.
The first sensor creates a random integer list as data and a random boolean with a 50% chance of success. It logs the generated values and returns a PokeReturnValue.
The second sensor and the Python operator both try to get the data from XCom and log it.
Graph of the DAG
# region IMPORTS
import random
import logging
from datetime import datetime, timedelta
from airflow import DAG
from heliocampus.configuration.constants import Constants
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue
from airflow.utils.trigger_rule import TriggerRule
from box import Box
# endregion
# region configuration
constants = Constants()
dagconfig = Box({ "Code":"Test" })
# endregion
def main() -> DAG:
    # region default_args
    args = dict()
    args['start_date'] = datetime(2021, 1, 1)
    # endregion

    with DAG(dag_id=dagconfig.Code, schedule_interval="@once", default_args=args, tags=['test', 'V0.1.4']) as dag:
        start = EmptyOperator(task_id="start")

        # region Sensors
        check_all_expired_tables = PythonSensor(
            task_id="CHECK_ALL_EXPIRED_TABLES",
            poke_interval=timedelta(seconds=20).total_seconds(),
            timeout=timedelta(minutes=1).total_seconds(),
            mode="reschedule",
            python_callable=check_expired_tables,
            trigger_rule=TriggerRule.ALL_SUCCESS
        )

        check_all_expired_tables_notification = PythonOperator(
            task_id="CHECK_ALL_EXPIRED_TABLES_NOTIFICATION",
            python_callable=sensor_result_nofitication,
            op_kwargs={"notification_source": "CHECK_ALL_EXPIRED_TABLES"},
            trigger_rule=TriggerRule.ALL_FAILED
        )

        verify_ods_operator = PythonSensor(
            task_id="VERIFY_ODS",
            poke_interval=timedelta(seconds=30).total_seconds(),
            timeout=timedelta(hours=2).total_seconds(),
            mode="reschedule",
            python_callable=verify_ods,
            op_kwargs={"notification_source": "CHECK_ALL_EXPIRED_TABLES"},
            trigger_rule=TriggerRule.ALL_SUCCESS
        )
        # endregion

        end = EmptyOperator(task_id="end")

        start >> check_all_expired_tables >> verify_ods_operator >> end
        check_all_expired_tables >> check_all_expired_tables_notification

    return dag
# region Notifications
def sensor_result_nofitication(ti, notification_source):
    actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
    logging.info(f"sensor_result_nofitication : Sensor without key from {notification_source} is {actual_xcom_value}")
    actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
    logging.info(f"sensor_result_nofitication : Sensor return_value from {notification_source} is {actual_xcom_value}")
# endregion

def check_expired_tables():
    randomlist = random.sample(range(10, 30), 5)
    randomResult = random.randint(0, 100) > 50
    logging.info(f"check_expired_tables : returning PokeReturnValue(is_done={randomResult}, xcom_value={randomlist})")
    return PokeReturnValue(is_done=randomResult, xcom_value=randomlist)

def verify_ods(ti, notification_source):
    actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
    logging.info(f"verify_ods : Sensor without key from {notification_source} is {actual_xcom_value}")
    actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
    logging.info(f"verify_ods : Sensor return_value from {notification_source} is {actual_xcom_value}")
    rnd = random.randint(0, 100)
    logging.info("Random Number : {num}".format(num=rnd))
    return (rnd > 20)
main()
Regardless of whether the first sensor is successful or not, the data from XCom cannot be logged in the second sensor or the Python operator.
I don't know if the problem is on the pushing side or the pulling side.
I cannot see any rows inserted into the Airflow database (xcom table).
The problem lives in the PythonSensor, which coerces the return value of the Python callable to a boolean without checking its type first:
return_value = self.python_callable(*self.op_args, **self.op_kwargs)
return PokeReturnValue(bool(return_value))
To get the expected behavior, something like this needs to be added to the PythonSensor:
return return_value if isinstance(return_value, PokeReturnValue) else PokeReturnValue(bool(return_value))
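If patching the sensor is not an option, a workaround that stays inside the DAG code is to push the list explicitly from the callable and return a plain boolean. This is only a sketch: it relies on the callable receiving the task instance (as verify_ods above already does), and the key name "expired_tables" is illustrative.
def check_expired_tables(ti):
    randomlist = random.sample(range(10, 30), 5)
    random_result = random.randint(0, 100) > 50
    if random_result:
        # Push the data under an explicit key instead of relying on
        # PokeReturnValue.xcom_value, which the sensor discards here.
        ti.xcom_push(key="expired_tables", value=randomlist)
    return random_result  # the sensor only needs truthiness

def verify_ods(ti, notification_source):
    # Pull with the explicit key (a single task_id returns the value itself, not a list).
    expired_tables = ti.xcom_pull(key="expired_tables", task_ids=notification_source)
    logging.info(f"verify_ods : expired tables from {notification_source} is {expired_tables}")
    return random.randint(0, 100) > 20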
Two simple questions:
Does Warp10 integrate into streamlit to feed visualisations?
If so, please would you specify how this can be accomplished?
Thanking you in advance.
Best wishes,
There's no direct integration of Warp 10 in streamlit.
Although streamlit can handle any kind of data, it's mainly focused on pandas DataFrame. DataFrames are tables whereas Warp 10 Geo Time Series are time series. So even if Warp 10 was integrated in streamlit, it would require some code to properly format the data for streamlit to give its full potential.
That being said, here is a small example on how to display data stored in Warp 10 with streamlit:
import json
from datetime import datetime, timedelta
import requests
import streamlit as st
from bokeh.palettes import Category10_10 as palette
from bokeh.plotting import figure
# Should be put in a configuration file.
fetch_endpoint = 'http://localhost:8080/api/v0/fetch'
token = 'READ' # Change that to your actual token
def load_data_as_json(selector, start, end):
    headers = {'X-Warp10-Token': token}
    params = {'selector': selector, 'start': start, 'end': end, 'format': 'json'}
    r = requests.get(fetch_endpoint, params=params, headers=headers)
    return r.text
st.title('Warp 10 Test')
# Input parameters
selector = st.text_input('Selector', value="~streamlit.*{}")
start_date = st.date_input('Start date', value=datetime.now() - timedelta(days=10))
start_time = st.time_input('Start time')
end_date = st.date_input('End date')
end_time = st.time_input('End time')
# Convert datetime.dates and datetime.times to microseconds (default time unit in Warp 10)
start = int(datetime.combine(start_date, start_time).timestamp()) * 1000000
end = int(datetime.combine(end_date, end_time).timestamp()) * 1000000
# Make the query to Warp 10 and get back a json.
json_data = load_data_as_json(selector, start, end)
gtss = json.loads(json_data)
# Iterate through the json and populate a Bokeh graph.
p = figure(title='GTSs', x_axis_label='time', y_axis_label='value')
for gts_index, gts in enumerate(gtss):
    tss = []
    vals = []
    for point in gts['v']:
        tss.append(point[0])
        vals.append(point[-1])
    p.line(x=tss, y=vals, legend_label=gts['c'] + json.dumps(gts['l']), color=palette[gts_index % len(palette)])
st.bokeh_chart(p, use_container_width=True)
# Also display the json.
st.json(json_data)
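Following on from the note about DataFrames above, the same JSON can also be flattened into a pandas DataFrame and handed to Streamlit's built-in charts instead of Bokeh. A small sketch, assuming the same fetch format as above (timestamps in microseconds, the default Warp 10 time unit):
import pandas as pd

# Flatten the list of GTS into one long DataFrame, then let Streamlit plot it.
rows = []
for gts in gtss:
    series_name = gts['c'] + json.dumps(gts['l'])
    for point in gts['v']:
        rows.append({
            'time': pd.to_datetime(point[0], unit='us'),  # microseconds since epoch
            'value': point[-1],
            'series': series_name,
        })

df = pd.DataFrame(rows)
if not df.empty:
    st.line_chart(df.pivot_table(index='time', columns='series', values='value'))
    st.dataframe(df)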
I need to download multiple 10-K documents. This code works fine if I download the 10-Ks for between 5 and 10 companies, but it fails if I increase the number of companies in the cik_lookup dict. Here's the code.
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm
Here's the .py file that includes the project_helper functions.
import matplotlib.pyplot as plt
import requests
from ratelimit import limits, sleep_and_retry
class SecAPI(object):
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit
    @limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        return requests.get(url)

    def get(self, url):
        return self._call_sec(url).text

def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
    indentation = ' '
    print('[')
    for ten_k in ten_k_data:
        print_statement = '{}{{'.format(indentation)
        for field in fields:
            value = str(ten_k[field])

            # Show return lines in output
            if isinstance(value, str):
                value_str = '\'{}\''.format(value.replace('\n', '\\n'))
            else:
                value_str = str(value)

            # Cut off the string if it gets too long
            if len(value_str) > field_length_limit:
                value_str = value_str[:field_length_limit] + '...'

            print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)

        print_statement += '},'
        print(print_statement)
    print(']')
The first step is to download the NLP corpora:
nltk.download('stopwords')
nltk.download('wordnet')
Then get the 10-Ks:
#cik_lookup = {
# 'GOOGL':'0001288776',
# 'AAPL':'0000320193',
# 'FACEBOOK':'0001326801',
# 'AMZN':'0001018724',
# 'MSFT':'0000789019'}
cik_lookup = {
'AEP': '0000004904',
'AXP': '0000004962',
'BA': '0000012927',
'BK': '0001390777',
'CAT': '0000018230',
'DE': '0000315189',
'DIS': '0001001039',
'DTE': '0000936340',
'ED': '0001047862',
'EMR': '0000032604',
'ETN': '0001551182',
'GE': '0000040545',
'IBM': '0000051143',
'IP': '0000051434',
'JNJ': '0000200406',
'KO': '0000021344',
'LLY': '0000059478',
'MCD': '0000063908',
'MO': '0000764180',
'MRK': '0000310158',
'MRO': '0000101778',
'PCG': '0001004980',
'PEP': '0000077476',
'PFE': '0000078003',
'PG': '0000080424',
'PNR': '0000077360',
'SYY': '0000096021',
'TXN': '0000097476',
'UTX': '0000101829',
'WFC': '0000072971',
'WMT': '0000104169',
'WY': '0000106535',
'XOM': '0000034088'}
Get the list of 10-Ks:
sec_api = project_helper.SecAPI()
from bs4 import BeautifulSoup
def get_sec_data(cik, doc_type, start=0, count=60):
    newest_pricing_data = pd.to_datetime('2021-01-01')
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
              '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
              .format(cik, doc_type, start, count)
    sec_data = sec_api.get(rss_url)
    feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
    entries = [
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)
        if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
    return entries

example_ticker = 'AEP'
sec_data = {}

for ticker, cik in cik_lookup.items():
    sec_data[ticker] = get_sec_data(cik, '10-K')
The code works fine if I download the 10-Ks for between 5 and 10 companies, but if I increase the number of companies in the cik_lookup dict, I get the following error. The first error I got is below:
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
5 rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' .format(cik, doc_type, start, count)
6 sec_data = sec_api.get(rss_url)
----> 7 feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
8 entries = [
9 (
UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)
However, after some Googling around BeautifulSoup encodings, I changed it to utf-8 and then got the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
11 entry.content.find('filing-type').getText(),
12 entry.content.find('filing-date').getText())
---> 13 for entry in feed.find_all('entry', recursive=False)
14 if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
15
AttributeError: 'NoneType' object has no attribute 'find_all'
The project can be accessed at the following GitHub repo: github repo here.