Download multiple 10-ks documents - edgar

I need to download multiple 10-ks documents, however, this code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function]. Here's code.
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm
Here's the py file that includes project_helper functions.
import matplotlib.pyplot as plt
import requests
from ratelimit import limits, sleep_and_retry
class SecAPI(object):
SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
#staticmethod
#sleep_and_retry
# Dividing the call limit by half to avoid coming close to the limit
#limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
def _call_sec(url):
return requests.get(url)
def get(self, url):
return self._call_sec(url).text
def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
indentation = ' '
print('[')
for ten_k in ten_k_data:
print_statement = '{}{{'.format(indentation)
for field in fields:
value = str(ten_k[field])
# Show return lines in output
if isinstance(value, str):
value_str = '\'{}\''.format(value.replace('\n', '\\n'))
else:
value_str = str(value)
# Cut off the string if it gets too long
if len(value_str) > field_length_limit:
value_str = value_str[:field_length_limit] + '...'
print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)
print_statement += '},'
print(print_statement)
print(']')
The first step it to download NLP Corpora.
nltk.download('stopwords')
nltk.download('wordnet')
Than Get 10ks
#cik_lookup = {
# 'GOOGL':'0001288776',
# 'AAPL':'0000320193',
# 'FACEBOOK':'0001326801',
# 'AMZN':'0001018724',
# 'MSFT':'0000789019'}
cik_lookup = {
'AEP': '0000004904',
'AXP': '0000004962',
'BA': '0000012927',
'BK': '0001390777',
'CAT': '0000018230',
'DE': '0000315189',
'DIS': '0001001039',
'DTE': '0000936340',
'ED': '0001047862',
'EMR': '0000032604',
'ETN': '0001551182',
'GE': '0000040545',
'IBM': '0000051143',
'IP': '0000051434',
'JNJ': '0000200406',
'KO': '0000021344',
'LLY': '0000059478',
'MCD': '0000063908',
'MO': '0000764180',
'MRK': '0000310158',
'MRO': '0000101778',
'PCG': '0001004980',
'PEP': '0000077476',
'PFE': '0000078003',
'PG': '0000080424',
'PNR': '0000077360',
'SYY': '0000096021',
'TXN': '0000097476',
'UTX': '0000101829',
'WFC': '0000072971',
'WMT': '0000104169',
'WY': '0000106535',
'XOM': '0000034088'}
Get list of 10-ks
sec_api = project_helper.SecAPI()
from bs4 import BeautifulSoup
def get_sec_data(cik, doc_type, start=0, count=60):
newest_pricing_data = pd.to_datetime('2021-01-01')
rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
'&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
.format(cik, doc_type, start, count)
sec_data = sec_api.get(rss_url)
feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
entries = [
(
entry.content.find('filing-href').getText(),
entry.content.find('filing-type').getText(),
entry.content.find('filing-date').getText())
for entry in feed.find_all('entry', recursive=False)
if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
return entries
example_ticker = 'AEP'
sec_data = {}
for ticker, cik in cik_lookup.items():
sec_data[ticker] = get_sec_data(cik, '10-K')
The code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function] I get the following error. The first error I got is as below.
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
5 rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' .format(cik, doc_type, start, count)
6 sec_data = sec_api.get(rss_url)
----> 7 feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
8 entries = [
9 (
UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)
However, after some google search over BeutifulSoup(ecodes) I changed it to utf-8 and then got the following error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
11 entry.content.find('filing-type').getText(),
12 entry.content.find('filing-date').getText())
---> 13 for entry in feed.find_all('entry', recursive=False)
14 if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
15
AttributeError: 'NoneType' object has no attribute 'find_all'
The project can be accessed here at the following github repo.
github repo herealso.

Related

How can I use Google Cloud Functions to run a web scraper?

Thanks in advance for your help.
I'm currently running a webscraper - this is the first time I've ever done something like this - It pulls addresses from the URL and then will match the address to the users input. This will be going into a chat bot, I wondering how I can make this run on Google Functions. Whats the process to do this, is there a tutorial anywhere?
This is my code so far. There is a small items file too
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy
class AddressesSpider(scrapy.Spider):
name = 'Addresses'
allowed_domains = ['find-energy-certificate.service.gov.uk']
postcode = "bh10+4ah"
start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]
## def start_requests(self):
## self.first = input("Please enter the address you would like to match: ")
## yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
def parse(self, response):
first = input("Please enter the address you would like to match: ")
highest_ratios = []
highest_item = None
for row in response.xpath('//table[#class="govuk-table"]//tr'):
address = row.xpath("normalize-space(.//a[#class='govuk-link']/text())").extract()[0].lower()
address = address.rsplit(',', 2)[0]
link = row.xpath('.//a[#class="govuk-link"]/#href').extract()
details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
ratio = fuzz.token_set_ratio(address, first)
item = DataItem()
item['link'] = link
item['details'] = details
item['address'] = address
item['ratioresult'] = ratio
if len(highest_ratios) < 3:
highest_ratios.append(item)
elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
highest_ratios.append(item)
highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]
if highest_ratios_100:
for item in highest_ratios_100:
yield item
else:
yield max(highest_ratios, key=lambda x: x['ratioresult'])
if len(highest_ratios_100) > 1:
for i, item in enumerate(highest_ratios_100):
print(f"{i+1}: {item['address']}")
selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
selected_item = highest_ratios_100[selected]
else:
selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])
new_url = selected_item['link'][0]
new_url = str(new_url)
if new_url:
base_url = 'https://find-energy-certificate.service.gov.uk'
print(f'Base URL: {base_url}')
print(f'New URL: {new_url}')
new_url = urljoin(base_url, new_url)
print(f'Combined URL: {new_url}')
yield scrapy.Request(new_url, callback=self.parse_new_page)
def parse_new_page(self, response):
Postcode = response.xpath('normalize-space((//p[#class="epc-address govuk-body"]/text())[last()])').extract()
Town = response.xpath('normalize-space((//p[#class="epc-address govuk-body"]/text())[last()-1])').extract()
First = response.xpath(".//p[#class='epc-address govuk-body']").extract()
Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
Expiry = response.xpath('normalize-space(//b)').extract_first()
FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()
yield {
'Postcode': Postcode,
'Town': Town,
'First': First,
'Type': Type,
'Walls': Walls,
'Roof': Roof,
'Heating': Heating,
'CurrentScore': CurrentScore,
'Maxscore': Maxscore,
'Expiry': Expiry,
'FloorArea': FloorArea,
'Steps': Steps
}
I've tried googling and having a look around and can't get how to deploy this as a project to run on google functions or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def my_cloud_function(event, context):
def script(queue):
try:
settings = get_project_settings()
settings.setdict({
'LOG_LEVEL': 'ERROR',
'LOG_ENABLED': True,
})
process = CrawlerProcess(settings)
process.crawl(MySpider)
process.start()
queue.put(None)
except Exception as e:
queue.put(e)
queue = Queue()
# wrap the spider in a child process
main_process = Process(target=script, args=(queue,))
main_process.start() # start the process
main_process.join() # block until the spider finishes
result = queue.get() # check the process did not return an error
if result is not None:
raise result
return 'ok'
You can refer to this tutorial for more info.

Sensor return value can not be stored/retreived using PokeReturnValue

Below code creates the dag (the graph is also attached) which contains 2 PythonSensors and a PythonOperator.
First Sensors creates a random integer list as data and a random boolean with 50% chance of success. It logs generated values and returns PokeReturnValue
Second sensor and Python operator both tries to get data from xcom and log them.
Graph of DAG
# region IMPORTS
import random
import logging
from datetime import datetime, timedelta
from airflow import DAG
from heliocampus.configuration.constants import Constants
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.python import PythonSensor
from airflow.sensors.base import PokeReturnValue
from airflow.utils.trigger_rule import TriggerRule
from box import Box
# endregion
# region configuration
constants = Constants()
dagconfig = Box({ "Code":"Test" })
# endregion
def main() -> DAG:
# region default_args
args = dict()
args['start_date'] = datetime(2021, 1, 1)
# endregion
with DAG(dag_id=dagconfig.Code, schedule_interval="#once", default_args=args, tags=['test', 'V0.1.4']) as dag:
start = EmptyOperator(task_id="start")
# region Sensors
check_all_expired_tables = PythonSensor(
task_id="CHECK_ALL_EXPIRED_TABLES",
poke_interval=timedelta(seconds=20).total_seconds(),
timeout=timedelta(minutes=1).total_seconds(),
mode="reschedule",
python_callable=check_expired_tables,
trigger_rule=TriggerRule.ALL_SUCCESS
)
check_all_expired_tables_notification = PythonOperator(
task_id="CHECK_ALL_EXPIRED_TABLES_NOTIFICATION",
python_callable=sensor_result_nofitication,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_FAILED
)
verify_ods_operator = PythonSensor(
task_id="VERIFY_ODS",
poke_interval=timedelta(seconds=30).total_seconds(),
timeout=timedelta(hours=2).total_seconds(),
mode="reschedule",
python_callable=verify_ods,
op_kwargs={"notification_source":"CHECK_ALL_EXPIRED_TABLES"},
trigger_rule=TriggerRule.ALL_SUCCESS
)
# endregion
end = EmptyOperator(task_id="end")
start >> check_all_expired_tables >> verify_ods_operator >> end
check_all_expired_tables >> check_all_expired_tables_notification
return dag
# region Notifications
def sensor_result_nofitication(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"sensor_result_nofitication : Sensor return_value from {notification_source} is {actual_xcom_value}")
# endregion
def check_expired_tables():
randomlist = random.sample(range(10, 30), 5)
randomResult = random.randint(0, 100) > 50
logging.info(f"check_expired_tables : returning PokeReturnValue(is_done={randomResult}, xcom_value={randomlist})")
return PokeReturnValue(is_done=randomResult, xcom_value=randomlist)
def verify_ods(ti, notification_source):
actual_xcom_value = ti.xcom_pull(task_ids=[notification_source])
logging.info(f"verify_ods : Sensor without key from {notification_source} is {actual_xcom_value}")
actual_xcom_value = ti.xcom_pull(key='return_value', task_ids=[notification_source])
logging.info(f"verify_ods : Sensor return_value from {notification_source} is {actual_xcom_value}")
rnd = random.randint(0, 100)
logging.info("Random Number : {num}".format(num=rnd))
return (rnd > 20)
main()
Regardless of whether the first sensor is successfull or not the data from xcom can not be logged in the second sensor or python operator.
I don't know if the problem is on the pushing side or pulling side.
I can not see any rows inserted in airflow database (xcom table).
The problem lives in the PythonSensor which is coercing the return of the python callable to boolean without checking its type first:
return_value = self.python_callable(*self.op_args, **self.op_kwargs)
return PokeReturnValue(bool(return_value))
To get the expected behavior something like this needs to be added to the PythonSensor:
return return_value if isinstance(return_value, PokeReturnValue) else PokeReturnValue(bool(return_value)

Warp10 and streamlit integration?

Two simple questions:
Does Warp10 integrate into streamlit to feed visualisations?
If so, please would you specify how this can be accomplished?
Thanking you in advance.
Best wishes,
There's no direct integration of Warp 10 in streamlit.
Although streamlit can handle any kind of data, it's mainly focused on pandas DataFrame. DataFrames are tables whereas Warp 10 Geo Time Series are time series. So even if Warp 10 was integrated in streamlit, it would require some code to properly format the data for streamlit to give its full potential.
That being said, here is a small example on how to display data stored in Warp 10 with streamlit:
import json
from datetime import datetime, timedelta
import requests
import streamlit as st
from bokeh.palettes import Category10_10 as palette
from bokeh.plotting import figure
# Should be put in a configuration file.
fetch_endpoint = 'http://localhost:8080/api/v0/fetch'
token = 'READ' # Change that to your actual token
def load_data_as_json(selector, start, end):
headers = {'X-Warp10-Token': token}
params = {'selector': selector, 'start': start, 'end': end, 'format': 'json'}
r = requests.get(fetch_endpoint, params=params, headers=headers)
return r.text
st.title('Warp 10 Test')
# Input parameters
selector = st.text_input('Selector', value="~streamlit.*{}")
start_date = st.date_input('Start date', value=datetime.now() - timedelta(days=10))
start_time = st.time_input('Start time')
end_date = st.date_input('End date')
end_time = st.time_input('End time')
# Convert datetime.dates and datetime.times to microseconds (default time unit in Warp 10)
start = int(datetime.combine(start_date, start_time).timestamp()) * 1000000
end = int(datetime.combine(end_date, end_time).timestamp()) * 1000000
# Make the query to Warp 10 and get back a json.
json_data = load_data_as_json(selector, start, end)
gtss = json.loads(json_data)
# Iterate through the json and populate a Bokeh graph.
p = figure(title='GTSs', x_axis_label='time', y_axis_label='value')
for gts_index, gts in enumerate(gtss):
tss = []
vals = []
for point in gts['v']:
tss.append(point[0])
vals.append(point[-1])
p.line(x=tss, y=vals, legend_label=gts['c'] + json.dumps(gts['l']), color=palette[gts_index % len(palette)])
st.bokeh_chart(p, use_container_width=True)
# Also display the json.
st.json(json_data)

I'm Getting error json.decoder.JSONDecodeError: while running a python code

I have got this code from internet, for extracting data from justdial website.
While running this code I got the following error:
ERROR:json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) is shown.
Please help me to run this code as i'm not familiar with python. What changes should be done to run this code step by step.
Thank you in advance.
Here is my code:
import csv
import json
import requests
from bs4 import BeautifulSoup
print(25*"=")
print("Just Dial Scraper")
print(25*"=")
url = 'http://www.justdial.com/functions/ajxsearch.php?national_search=0&act'\
'=pagination&city={0}&search={1}&page={2}'
what = input("Enter your Query: ")
what = what.replace(' ', '+')
where = input("Enter the Location: ")
with open(what+"_"+where+'.csv', 'w') as f:
f.write('company, address, phone\n')
page = 1
while True:
print('Scraping Page', page)
resp = requests.get(url.format(where, what, page))
if not resp.json()['paidDocIds']:
print(25*"-")
print('Scraping Finished')
print(25*"-")
break
markup = resp.json()['markup'].replace('\/', '/')
soup = BeautifulSoup(markup, 'html.parser')
for thing in soup.find_all('section'):
csv_list = []
if thing.get('class') == ['jcar']:
# Company name
for a_tag in thing.find_all('a'):
if a_tag.get('onclick') == "_ct('clntnm', 'lspg');":
csv_list.append(a_tag.get('title'))
# Address
for span_tag in thing.find_all('span'):
if span_tag.get('class') == ['mrehover', 'dn']:
csv_list.append(span_tag.get_text().strip())
# Phone number
for a_tag in thing.find_all('a'):
if a_tag.get('href').startswith('tel:'):
csv_list.append(a_tag.get('href').split(':')[-1])
csv_list = ['"'+item+'"' for item in csv_list]
writeline = ','.join(csv_list)+'\n'
f.write(','.join(csv_list)+'\n')
page += 1

Error in mapping DataFrame for keyword in ipython

I have collected tweets and wish to add a column named taliban that maps (true/false) for the word (Taliban/taliban) present in the tweet. Please see the code and its error.
tweets = pd.DataFrame()
from pandas.io.json import json_normalize
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", \
"coordinates", "user.location"]]
The code to define word is as follows:
#definition for collecting keyword.
def word_in_text(word, text):
word = word.lower()
text = text.lower()
match = re.search(word, text)
if match:
return True
return False
Then I run this query:
tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
I get the following error:
AttributeError Traceback (most recent call last)
<ipython-input-16-17bcf7fbf866> in <module>()
----> 1 tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
/usr/lib64/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
2167 values = lib.map_infer(values, lib.Timestamp)
2168
-> 2169 mapped = lib.map_infer(values, f, convert=convert_dtype)
2170 if len(mapped) and isinstance(mapped[0], Series):
2171 from pandas.core.frame import DataFrame
pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:62578)()
<ipython-input-16-17bcf7fbf866> in <lambda>(tweet)
----> 1 tweets['Taliban'] = tweets['text'].apply(lambda tweet: word_in_text('Taliban', tweet))
<ipython-input-15-56228996ad5c> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
AttributeError: 'float' object has no attribute 'lower'
In [ ]:
I have tried this modification:
#definition for collecting keyword.
def word_in_text(word, text):
word = word.lower()
if type(text) is str:
return text.lower()
match = re.search(word, text)
if match:
return True
return False
to be used in:
tweets['taliban'] = tweets['text'].apply(lambda tweet: word_in_text('taliban', tweet))
But the error given is:
UnboundLocalError: local variable 'match' referenced before assignment

Resources