What Caused the Python NoneType Error During My Splinter 'click()' Call?

When trying to scrape the county data from multiple Politico state web pages, such as this one, I concluded the best method was to first click the button that expands the county list before grabbing the table body's data (when present). However, my attempt at clicking the button failed:
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
state_page_url = "https://www.politico.com/2020-election/results/washington/"
executable_path = {'executable_path': 'chrome-driver/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(state_page_url)
state_soup = bs(browser.html, 'html.parser')
reveal_button = state_soup.find('button', class_='jsx-3713440361')
if reveal_button is None:
    pass  # Steps to take when the button isn't present
else:
    reveal_button.click()
The error returned when following the else-condition is for my click() call: "TypeError: 'NoneType' object is not callable". This doesn't make sense to me, since I thought the if-statement implied that reveal_button was not None. Am I misinterpreting the error message, how reveal_button was set, or what I'm actually working with after making state_soup?

Based on the comment thread for the question, and this solution to a similar question, I came across the following fix:
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
# Navigate the page to click the desired button
state_page_url = "https://www.politico.com/2020-election/results/alabama/"
driver = webdriver.Chrome(executable_path='chrome-driver/chromedriver.exe')
driver.get(state_page_url)
button_list = driver.find_elements(By.CLASS_NAME, 'jsx-3713440361')
if button_list == []:
    pass  # Actions to take when no button is found
else:
    button_list[-1].click()  # The index was determined through trial/error specific to the web page
# Now to grab the table and its data
state_soup = bs(driver.page_source, 'html.parser')
state_county_results_table = state_soup.find('tbody', class_='jsx-3713440361')
Note that this approach requires Selenium for navigating and interacting with the page, while BeautifulSoup4 is only used to parse the rendered HTML for the information I need.
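As for why the original code raised the error even after the None check: BeautifulSoup parses a static snapshot of the HTML, so its Tag objects cannot interact with the page and have no click() method at all. Accessing an unknown attribute such as .click on a Tag is bs4 shorthand for find('click'), which returns None because no <click> child tag exists, and calling that None is what raises the TypeError. A minimal illustration:
from bs4 import BeautifulSoup as bs
btn = bs('<button>County results</button>', 'html.parser').button
print(btn.click)  # None -- bs4 treats .click as a search for a <click> child tag
btn.click()       # TypeError: 'NoneType' object is not callable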

Related

Scraping: No attribute find_all for <p>

Good morning :)
I am trying to scrape the content of this website: https://public.era.nih.gov/pubroster/preRosIndex.era?AGENDA=438653&CID=102313
The text I am trying to get seems to be located inside some <p> tags and separated by <br> tags.
For some reason, whenever I try to access a <p>, I get the following error: "ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?", and this happens even if I use find() instead of find_all().
My code is below (it is a very simple thing with no loop yet; I just would like to identify where the mistake comes from):
from selenium import webdriver
import time
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument("headless")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(executable_path='MYPATH/chromedriver', options=options)
url = "https://public.era.nih.gov/pubroster/preRosIndex.era?AGENDA=446096&CID=102313"
driver.maximize_window()
driver.implicitly_wait(5)  # wait up to 5 seconds before calls to find elements time out
driver.get(url)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
column = soup.find_all("div", class_="col-sm-12")
people_in_column = column.find_all("p").find_all("br")
Is there anything obvious I am not understanding here?
Thanks a lot in advance for your help!
You are calling find_all() on a ResultSet, i.e. a list of elements, which is incorrect: find_all() returns a ResultSet, and a ResultSet has no find() or find_all() method of its own, so you have to iterate over it and search within each element. The correct way is as follows. Hope it helps.
columns = soup.find_all("div", class_="col-sm-12")
for column in columns:
    people_in_column = column.find("p").get_text(strip=True)
    print(people_in_column)
Full working code as an example:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
options = Options()
options.add_argument("headless")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
webdriver_service = Service("./chromedriver")  # Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = "https://public.era.nih.gov/pubroster/preRosIndex.era?AGENDA=446096&CID=102313"
driver.maximize_window()
driver.implicitly_wait(5)  # wait up to 5 seconds before calls to find elements time out
driver.get(url)
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")
columns = soup.find_all("div", class_="col-sm-12")
for column in columns:
    people_in_column = column.find("p").get_text(strip=True)
    print(people_in_column)
Output:
Notice of NIH Policy to All Applicants:Meeting rosters are provided for information purposes only. Applicant investigators and institutional officials must not communicate directly with study section members about an application before or after the review. Failure to observe this policy will create a serious breach of integrity in the peer review process, and may lead to actions outlined inNOT-OD-22-044, including removal of the application from immediate review.
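Note the missing spaces in that output (e.g. "inNOT-OD-22-044"): get_text(strip=True) joins a tag's strings without a separator. Passing a separator fixes that, and it also splits apart the <br>-separated names the question was after. A short sketch reusing the soup from above; the exact tag layout on the page is an assumption:
for column in columns:
    for p in column.find_all("p"):
        # "\n" is inserted where tags such as <br> separated the strings,
        # so each name lands on its own line and words keep their spacing
        print(p.get_text("\n", strip=True))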

problem with event loops, gui qt5, and ipywidgets in a jupyter notebook

I am trying to integrate interactive ipywidgets with a loop in my code that also performs other tasks (in this case, acquiring data from some hardware attached to the computer and updating live plots).
In the past, I could do this by using IPython.kernel.do_one_iteration() in my while loop: this would trigger a sync of the ipywidget changes and I would be able to retrieve them from the python widget objects. A minimal example is here:
import ipywidgets as widgets
from time import sleep
import IPython
do_one_iteration = IPython.get_ipython().kernel.do_one_iteration
w = widgets.ToggleButton()
display(w)
i = 0
while True:
    do_one_iteration()
    print(i, w.value, end="\r")
    w.description = str(i)
    sleep(0.5)
    i += 1
Here, the while loop prints out the ticker integer along with the state of the widget. (In the real code, I would also acquire data, update plots, and change plot / acquisition settings depending on the user's interaction via the widgets.)
With ipykernel 5.3.2 and ipython 7.16.1, this worked fine: if the widget changed, calling do_one_iteration() synced the widget states to the kernel and I could retrieve it from my while loop.
After an upgrade (to 6.4.1 and 7.29.0), this no longer works. It seems that do_one_iteration() is now a coroutine: I get the warning "coroutine 'Kernel.do_one_iteration' was never awaited" if I use the above code.
With some help from a friend, we found a way to do this with asyncio:
%gui asyncio
import asyncio
import ipywidgets as widgets
button = widgets.ToggleButton()
display(button)
text = widgets.Text()
display(text)
text.value = str(button.value)
stop_button = widgets.ToggleButton()
stop_button.description = "Stop"
display(stop_button)
async def f():
    i = 0
    while True:
        i += 1
        text.value = str(i) + " " + str(button.value)
        await asyncio.sleep(0.2)
        if stop_button.value == True:
            return
asyncio.create_task(f());
And this works (also adding a stop button, and changing to a text output widget instead of printing). But to throw a spanner in the works, I need to use a library that itself uses a Qt GUI event loop. Some more puzzling suggested that this should be the code to make it work:
%gui qt5
import asyncio
import ipywidgets as widgets
import qasync
button = widgets.ToggleButton()
display(button)
text = widgets.Text()
display(text)
text.value = str(button.value)
stop_button = widgets.ToggleButton()
stop_button.description = "Stop"
display(stop_button)
async def f():
    i = 0
    while True:
        i += 1
        text.value = str(i) + " " + str(button.value)
        await asyncio.sleep(0.2)
        if stop_button.value == True:
            return
from qtpy import QtWidgets
APP = QtWidgets.QApplication.instance()
loop = qasync.QEventLoop(APP)
asyncio.set_event_loop(loop)
asyncio.create_task(f());
But with this code, the updates do not propagate, and I get the following error on the terminal running my notebook server:
[IPKernelApp] ERROR | Error in message handler
Traceback (most recent call last):
File "/Users/gsteele/anaconda3/envs/myenv2/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
await self.process_one()
File "/Users/gsteele/anaconda3/envs/myenv2/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 440, in process_one
t, dispatch, args = await self.msg_queue.get()
RuntimeError: Task <Task pending name='Task-2'
coro=<Kernel.dispatch_queue() running at
/Users/gsteele/anaconda3/envs/myenv2/lib/python3.9/site-packages/ipykernel/kernelbase.py:457>
cb=[IOLoop.add_future.<locals>.<lambda>() at /Users/gsteele/anaconda3/envs/myenv2/lib/python3.9/site-packages/tornado/ioloop.py:688]>
got Future <Future pending> attached to a different loop
It seems that somehow my ipywidgets events are propagating to the wrong event loop.
And now my question is: does anybody know what is going on here?
It's hard for me to identify if this is a "bug", and if so, in which software package do things go wrong? ipykernel? Or tornado? Or ipywidgets? Or asyncio? Or maybe I'm missing something?
Any thoughts highly welcome, thanks!
Found at least a partial solution: using the nest_asyncio package allows me to now use do_one_iteration(), just by adding the following to the first code block:
import nest_asyncio
nest_asyncio.apply()
and then using await do_one_iteration() instead of calling it directly.
(see https://github.com/ipython/ipykernel/issues/825)
For my purposes, this solves my issue since I don't need asynchronous interaction with my GUI. The problem of the %gui qt5 interaction with the event loop in the asynchronous versions of the code is still a mystery though...
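Putting those pieces together, here is a minimal sketch of the first example updated for the newer ipykernel. It leans on IPython's support for top-level await in notebook cells plus nest_asyncio's re-entrant loop patch; treat it as a consolidation of the steps above rather than a tested recipe:
import nest_asyncio
nest_asyncio.apply()  # allow re-entering the notebook's already-running event loop
import ipywidgets as widgets
from time import sleep
import IPython
do_one_iteration = IPython.get_ipython().kernel.do_one_iteration
w = widgets.ToggleButton()
display(w)
i = 0
while True:
    await do_one_iteration()  # now a coroutine, so it must be awaited
    print(i, w.value, end="\r")
    w.description = str(i)
    sleep(0.5)
    i += 1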

Multiple classes, unable to return desired page(s)

First, I want to say that I am a first-time poster, so I am sorry in advance if any part of my question or the way it is asked/presented "sucks." With that being said, I've been trying to scrape a table on barchart.com that spans multiple pages, using Jupyter and BeautifulSoup, and while I have been successful in returning the entire page as a whole, I haven't had much luck returning the specific parts I need. I did include some images, the first three of which show the elements I am currently "choosing" from:
the 'div' element that highlights the entire table
another 'div' element within the first 'div' that also has the entire table I need
The 'table' element that I would use but it doesn't include the left most column that includes the tickers/stock symbols
Regardless of what I put in my code, I always get "[]" back and haven't been able to figure out how to address the multiple parts of each 'div' or 'table', if that makes sense.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen, Request
stonks_url = Request('https://www.barchart.com/options/unusual-activity/stocks', headers={'User-Agent': 'Mozilla/5.0'})
stonks_data = urlopen(stonks_url)
stonks_html = stonks_data.read()
stonks_data.close()
page_soup = soup(stonks_html, 'html.parser')
uoa_table = page_soup.findAll('tbody', {'data-ng-repeat': 'rows in content'})
print(uoa_table)
Thanks in advance to any advice or guidance!
As this page builds its table with JavaScript, a plain request won't return the data; you need Selenium to load the page, then take the page source and use it to process the table:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.barchart.com/options/unusual-activity/stocks')
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get text
text = soup.get_text()
print(text)
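To pull just the table instead of the whole page's text, the selector from the question should work once the page has actually rendered; waiting for the rows to show up before parsing avoids reading an empty page. A sketch along those lines (the data-ng-repeat attribute comes from the question; the wait condition is an assumption about the page):
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://www.barchart.com/options/unusual-activity/stocks')
# Wait until the page's JavaScript has rendered at least one table row
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody tr')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
uoa_table = soup.find('tbody', {'data-ng-repeat': 'rows in content'})
if uoa_table is not None:
    for row in uoa_table.find_all('tr'):
        print([cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])])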

How to update payload info for python scraping

I have a python scraper that works for this site:
https://dhhr.wv.gov/COVID-19/Pages/default.aspx
It will scrape the tooltips from one of the graphs that is navigated to by clicking the "Positive Case Trends" link in the above URL.
here is my code:
import re
import requests
import json
from datetime import date
url4 = 'https://wabi-us-gov-virginia-api.analysis.usgovcloudapi.net/public/reports/querydata?synchronous=true'
# payload:
x=r'{"version":"1.0.0","queries":[{"Query":{"Commands":[{"SemanticQueryDataShapeCommand":{"Query":{"Version":2,"From":[{"Name":"c","Entity":"Case Data"}],"Select":[{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Lab Report Date"},"Name":"Case Data.Lab Add Date"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Daily Confirmed Cases"}},"Function":0},"Name":"Sum(Case Data.Daily Confirmed Cases)"},{"Aggregation":{"Expression":{"Column":{"Expression":{"SourceRef":{"Source":"c"}},"Property":"Daily Probable Cases"}},"Function":0},"Name":"Sum(Case Data.Daily Probable Cases)"}]},"Binding":{"Primary":{"Groupings":[{"Projections":[0,1,2]}]},"DataReduction":{"DataVolume":4,"Primary":{"BinnedLineSample":{}}},"Version":1}}}]},"CacheKey":"{\"Commands\":[{\"SemanticQueryDataShapeCommand\":{\"Query\":{\"Version\":2,\"From\":[{\"Name\":\"c\",\"Entity\":\"Case Data\"}],\"Select\":[{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Lab Report Date\"},\"Name\":\"Case Data.Lab Add Date\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Daily Confirmed Cases\"}},\"Function\":0},\"Name\":\"Sum(Case Data.Daily Confirmed Cases)\"},{\"Aggregation\":{\"Expression\":{\"Column\":{\"Expression\":{\"SourceRef\":{\"Source\":\"c\"}},\"Property\":\"Daily Probable Cases\"}},\"Function\":0},\"Name\":\"Sum(Case Data.Daily Probable Cases)\"}]},\"Binding\":{\"Primary\":{\"Groupings\":[{\"Projections\":[0,1,2]}]},\"DataReduction\":{\"DataVolume\":4,\"Primary\":{\"BinnedLineSample\":{}}},\"Version\":1}}}]}","QueryId":"","ApplicationContext":{"DatasetId":"fb9b182d-de95-4d65-9aba-3e505de8eb75","Sources":[{"ReportId":"dbabbc9f-cc0d-4dd0-827f-5d25eeca98f6"}]}}],"cancelQueries":[],"modelId":339580}'
x=x.replace("\\\'","'")
json_data = json.loads(x)
final_data2 = requests.post(url4, json=json_data, headers={'X-PowerBI-ResourceKey': 'ab4e5874-7bbf-44c9-9443-0701abdee612'}).json()
print(json.dumps(final_data2))
The issue is that some days it stops working because the payload and the X-PowerBI-ResourceKey header values change, and I have to find the new values manually in the browser's network inspector and copy and paste them into my source. Is there a way to programmatically obtain these from the webpage and construct them in my code?
I'm pretty sure the resource key is part of the iframe url encoded as base64.
from base64 import b64decode
from bs4 import BeautifulSoup
import json
import requests
resp = requests.get('https://dhhr.wv.gov/COVID-19/Pages/default.aspx')
soup = BeautifulSoup(resp.text, 'html.parser')
data = soup.find_all('iframe')[0]['src'].split('=').pop()
decoded = json.loads(b64decode(data).decode())
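If the decoded JSON follows the usual Power BI public-embed format, the resource key should sit under the "k" field; that field name is an assumption here, so print decoded first to confirm what this page actually returns. Reusing url4 and json_data from the original script, the header could then be built programmatically:
# Assumption: the decoded embed config exposes the resource key as "k"
resource_key = decoded['k']
final_data2 = requests.post(url4, json=json_data,
                            headers={'X-PowerBI-ResourceKey': resource_key}).json()
print(json.dumps(final_data2))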

OSError Err 22 [Invalid Argument] when scraping NASA website using python script

I've just put together a little scraping script in my Windows environment to download all the pictures from the URLs linked off the main index URL. A few pictures download into the subdirectory, but then the code dumps out with the error:
"OSError: [Errno 22] Invalid argument:
'apod_pictures\start?type=1x1'"
What am I doing wrong? Any help would be appreciated.
Thanks,
Alun.
import os
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Download index page
base_url = "https://apod.nasa.gov/apod/astropix.html"
download_directory = "apod_pictures"
content = urllib.request.urlopen(base_url).read()
# For each link on the index page
for link in BeautifulSoup(content, "lxml").findAll("a"):
    print("Following link:", link)
    href = urljoin(base_url, link["href"])
    # follow the link and pull down the image held on that page
    content = urllib.request.urlopen(href).read()
    for img in BeautifulSoup(content, "lxml").findAll("img"):
        img_href = urljoin(href, img["src"])
        print("Downloading image:", img_href)
        img_name = img_href.split("/")[-1]
        "".join(x for x in img_name if x.isalnum())
        "".join(x for x in download_directory if x.isalnum())
        urllib.request.urlretrieve(img_href, os.path.join(download_directory, img_name))
"OSError: [Errno 22] Invalid argument: 'apod_pictures\\start?type=1x1'"
When I first saw the issue, I tried inserting the lines of code seen above and below, which didn't seem to correct things:
"".join(x for x in img_name if x.isalnum())
"".join(x for x in download_directory if x.isalnum())
