Beautifulsoup ':-soup-contains' pseudo-class is not implemented at this time - web-scraping

Whenever I run this code, I get the pseudo-class is not implemented error. I found this code online and I am trying to scrape the relevant information about the cities from Wikipedia.
I have updated python and beautiful soup to their most recent versions. Any help is greatly appreciated.
import requests
import bs4
from bs4 import BeautifulSoup as bs
import pandas as pd
import unicodedata
import re
# Alternative, mostly German selection (kept for reference, not active):
# cities = ['Berlin', 'Hamburg', 'Frankfurt','Munich','Stuttgart','Leipzig','Cologne','Dresden','Hannover','Paris', 'Barcelona','Lisbon','Madrid']

# Wikipedia article names of the cities to scrape, in processing order.
cities = [
    'Berlin',
    'Paris',
    'Amsterdam',
    'Barcelona',
    'Rome',
    'Lisbon',
    'Prague',
    'Vienna',
    'Madrid',
]
def City_info(soup):
    """Collect selected infobox facts from a parsed city page.

    Parameters
    ----------
    soup
        Parsed document exposing ``h1`` and ``select_one`` (a BeautifulSoup
        object in the calling script).

    Returns
    -------
    dict
        Always contains ``'city'`` (text of the first ``<h1>``). The keys
        ``'mayor'``, ``'city_size'``, ``'elevation'``, ``'city_population'``,
        ``'urban_population'``, ``'metro_population'``, ``'lat'`` and
        ``'long'`` are present only when their selector matches.

    Notes
    -----
    The ``:-soup-contains`` selectors are left exactly as in the original;
    they are resolved by the CSS engine behind ``select_one``.
    """
    def _nfkd(text):
        # NFKD folds compatibility characters (e.g. non-breaking spaces)
        # into their plain equivalents.
        return unicodedata.normalize('NFKD', text)

    ret_dict = {'city': soup.h1.get_text()}

    # Each selector is evaluated once and the result reused; the original
    # ran every query twice (once for the test, once for the value).
    mayor_label = soup.select_one('.mergedrow:-soup-contains("Mayor")>.infobox-label')
    if mayor_label is not None:
        ret_dict['mayor'] = _nfkd(mayor_label.find_next_sibling().get_text())

    city_label = soup.select_one('.mergedrow:-soup-contains("City")>.infobox-label')
    if city_label is not None:
        ret_dict['city_size'] = _nfkd(city_label.find_next_sibling('td').get_text())

    elevation_cell = soup.select_one('.mergedtoprow:-soup-contains("Elevation")>.infobox-data')
    if elevation_cell is not None:
        ret_dict['elevation'] = _nfkd(elevation_cell.get_text())

    population_row = soup.select_one('.mergedtoprow:-soup-contains("Population")')
    if population_row is not None:
        # find_next is the current spelling of the deprecated findNext alias.
        ret_dict['city_population'] = population_row.find_next('td').get_text()

    urban_link = soup.select_one('.infobox-label>[title^=Urban]')
    if urban_link is not None:
        ret_dict['urban_population'] = urban_link.find_next('td').get_text()

    metro_link = soup.select_one('.infobox-label>[title^=Metro]')
    if metro_link is not None:
        ret_dict['metro_population'] = metro_link.find_next('td').get_text()

    latitude = soup.select_one('.latitude')
    if latitude is not None:
        ret_dict['lat'] = latitude.get_text()

    longitude = soup.select_one('.longitude')
    if longitude is not None:
        ret_dict['long'] = longitude.get_text()

    return ret_dict
# Download each city's Wikipedia article, extract its infobox facts and
# assemble one DataFrame row per city, indexed by city name.
list_of_city_info = []
for city in cities:
    url = 'https://en.wikipedia.org/wiki/{}'.format(city)
    # The second positional argument of requests.get is `params`, so the
    # original call sent 'html.parser' as a query string. The parser name
    # belongs to the BeautifulSoup constructor instead.
    web = requests.get(url)
    soup = bs(web.content, 'html.parser')
    list_of_city_info.append(City_info(soup))
df_cities = pd.DataFrame(list_of_city_info)
df_cities = df_cities.set_index('city')
df_cities
I have not found any solutions for this unfortunately.

:-soup-contains is a css pseudo class selector to target a node's text.
It is provided by Soup Sieve, the official CSS selector implementation used by Beautiful Soup 4.7.0+, so with up-to-date versions of both packages your script should work fine.
So first check that your installed versions are current: :-soup-contains needs Soup Sieve 2.1 or newer, while older versions only know the now-deprecated form :contains().

Related

Data extraction through webscraping

Can someone please guide me as to how I can go about extracting data from this particular table? I have tried it multiple times but have not succeeded in extracting the required data.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Fetch the ETF page and take the first <ul class="list-unstyled"> as the "table".
# NOTE(review): `proxy_support` is not defined in this snippet - presumably set up elsewhere; confirm.
r = requests.get('https://etfdb.com/etf/ICLN/#fact-sheet', proxies = proxy_support).text
soup = bs(r,'html.parser')
da = soup.find_all('ul', {'class':'list-unstyled'})[0]
# First pass: count rows/columns and collect header names.
n_rows = 0
n_columns = 0
column_names = []
for row in da.find_all('li'):
# NOTE(review): find() returns a single element (or None), so len() here is not a count of <span> tags;
# find_all('span') was presumably intended - confirm.
td_tags = row.find('span')
if len(td_tags) > 0:
n_rows+=1
if n_columns == 0:
n_columns = len(td_tags)
# NOTE(review): 'a href' is passed as a tag name; presumably plain 'a' (anchors) was meant - confirm.
th_tags = row.find_all('a href')
if len(th_tags) > 0 and len(column_names) == 0:
for th in th_tags:
column_names.append(th.get_text())
if len(column_names) > 0 and len(column_names) != n_columns:
raise Exception("Column titles do not match the number of columns")
# Fall back to positional column labels when no header names were found.
columns = column_names if len(column_names) > 0 else range(0,n_columns)
df = pd.DataFrame(columns = columns, index= range(0,n_rows))
# Second pass: fill the frame cell by cell.
row_marker = 0
for row in da.find_all('li'):
column_marker = 0
columns = row.find_all('span')
for column in columns:
# NOTE(review): get_text() is called on `columns` (the find_all result) rather than on the loop
# variable `column`; this matches the AttributeError quoted after this snippet.
df.iat[row_marker,column_marker] = columns.get_text()
column_marker += 1
if len(columns) > 0:
row_marker += 1
For the code above I get the following error :
AttributeError: ResultSet object has no attribute 'get_text'. You're
probably treating a list of items like a single item. Did you call
find_all() when you meant to call find()?
Can any one tell me what I am doing wrong?
With bs4 4.7.1. to get first table
import requests
from bs4 import BeautifulSoup as bs

# Download the page and parse the raw bytes with the lxml parser.
response = requests.get('https://etfdb.com/etf/ICLN/#fact-sheet')
soup = bs(response.content, 'lxml')

# Every <li> inside the <ul> that directly follows the <h3> containing "Vitals";
# print the text of that entry's <span> elements as one list per entry.
for entry in soup.select('h3:contains(Vitals) + ul li'):
    print([span.text for span in entry.select('span')])
Earlier bs versions
# Without :contains - take the first <ul> that directly follows an <h3>
# and print the <span> texts of each of its <li> entries.
first_list = soup.select_one('h3 + ul')
for entry in first_list.select('li'):
    print([span.text for span in entry.select('span')])

Bs4: Trying to loop over different arrays with different lengths. Get IndexError: list index out of range

With Beautifulsoup4 and python3.7 I'm trying to loop over some arrays of links. Afterwards, I want to get some text from tags. But I'm encountering an error when running the code in the terminal.
Here the code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import csv
# Download the start page and collect the hrefs of all <a> directly inside a <dt>.
my_url = "http://www.example.com"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
links = page_soup.select('dt > a[href]')
link = [tag.get('href') for tag in links]
# The for statement rebinds i itself, so this initial assignment has no effect.
i = 0
# NOTE(review): indexes link[0..4999] regardless of len(link); with fewer than 5000 links
# link[i] raises IndexError, which is what the answer after this snippet addresses.
for i in range(0, 5000):
url = link[i]
Client = uReq(url)
pageHtml = Client.read()
Client.close()
pSoup = soup(pageHtml, "html.parser")
# Assigned but not read again anywhere in this snippet.
linkeas = pSoup.findAll(href=re.compile(my_url))
# Filter for findAll: keeps hrefs that contain both "html" and my_url (used as regex patterns).
def linkas(href):
return href and re.compile("html").search(href) and re.compile(my_url).search(href)
linka = pSoup.findAll(href=linkas)
if linka != []:
linkia = [tag.get('href') for tag in linka]
linko = len(linkia)
j = 0
for j in range (0, linko):
curl = linkia[j]
cClient = uReq(curl)
pageHtml = cClient.read()
cClient.close()
# NOTE(review): this parses page_html (the start page read at the top), not pageHtml
# (the page just read from curl) - presumably unintended; confirm.
Soup = soup(page_html, "html.parser")
country = Soup.select('.class > a:nth-of-type(3)')
# country[0] raises IndexError when the selector matches nothing.
countri = country[0].text.strip()
print(countri)
I've tried for days several ways but got so far as this with no results:
Traceback (most recent call last):
File "<stdin>", line 22, in <module>
IndexError: list index out of range
Could someone give some tip?
NOTE:
Arrays show like this:
print(linkia)
['http://www.example/example/1.html']
['http://www.example/example/2.html']
['http://www.example/example/3.html', 'http://www.example/example/4.html',
'http://www.example/example/5.html', 'http://www.example/example/6.html',
'http://www.example/example/7.html', 'http://www.example/example/8.html',
'http://www.example/example/9.html', 'http://www.example/example/10.html',
'http://www.example/example/11.html', 'http://www.example/example/12.html',
'http://www.example/example/13.html', 'http://www.example/example/14.html',
'http://www.example/example/15.html', 'http://www.example/example/16.html',
'http://www.example/example/17.html', 'http://www.example/example/18.html',
'http://www.example/example/19.html']
['http://www.example/example/20.html', 'http://www.example/example/example/21.html',
'http://www.example/example/example/22.html']
['http://www.example/example/23.html']
Thanks a lot for your time. Really appreciate. Will be connected all time with fast response.
change:
i = 0
for i in range(0, 5000):
url = link[i]
to just:
for url in link:
And then can get rid of the url = link[i]
You're essentially telling it to loop through 5000 items in your list, when you don't have 5000 items, hence the list index out of range. You really just want it to loop through each element until it runs out of items. And you can do that by simply saying for url in link:
Then the same for your other nested for loop.
change:
j = 0
for j in range (0, linko):
curl = linkia[j]
to:
for curl in linkia:
I will also note that if you were to set it up the way you have it, you wouldn't need to set the initial i or j to be = 0. Since you set the range/list to go from 0, 5000...the for loop would automatically start at that first element of 0. But again, that point is irrelevant, as I would not recommend iterating through your list like that. It a) isn't robust (you would need exactly 5000 items in your list every time it gets to that loop), and b) while it would work ok for your second loop because you set the range from 0, to the length of the list, it really is unnecessary since you can condense that into 1 line.
Try:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import csv

my_url = "http://www.example.com"

# Compiled once up front; the original rebuilt both patterns for every href
# tested and redefined the filter function on every loop iteration.
HTML_PATTERN = re.compile("html")
SITE_PATTERN = re.compile(my_url)


def linkas(href):
    """Return a truthy value for hrefs that contain both "html" and my_url."""
    return href and HTML_PATTERN.search(href) and SITE_PATTERN.search(href)


# Start page: collect the hrefs of all <a> directly inside a <dt>.
with uReq(my_url) as uClient:
    page_html = uClient.read()
page_soup = soup(page_html, "html.parser")
links = page_soup.select('dt > a[href]')
link = [tag.get('href') for tag in links]

for url in link:
    # The context manager closes the connection even if read() raises.
    with uReq(url) as Client:
        pageHtml = Client.read()
    pSoup = soup(pageHtml, "html.parser")

    # An empty result simply yields no inner iterations, so the explicit
    # "!= []" test of the original is not needed.
    linkia = [tag.get('href') for tag in pSoup.find_all(href=linkas)]
    for curl in linkia:
        with uReq(curl) as cClient:
            country_html = cClient.read()
        # Parse the page that was just downloaded. The original parsed
        # page_html (the start page) here, so the download was never used.
        country_soup = soup(country_html, "html.parser")
        country = country_soup.select('.class > a:nth-of-type(3)')
        if not country:
            # No matching element on this page; skip it instead of
            # failing with IndexError on country[0].
            continue
        countri = country[0].text.strip()
        print(countri)

push_notebook does not update bokeh chart

It is kind of a complex example, but I desperately hope to get help...
I'm using jupyter-notebook 5.2.0, bokeh version is 0.12.9 and ipywidgets is 7.0.1.
Here is my DataFrame df:
import numpy as np
import pandas as pd
import datetime
import string

# Epoch seconds for the first and last day of 2017 (local time).
# .timestamp() replaces strftime("%s"), which is a platform-specific
# extension and is not available everywhere.
start = int(datetime.datetime(2017,1,1).timestamp())
end = int(datetime.datetime(2017,12,31).timestamp())

# set parameters of DataFrame df for simulation
size, numcats = 100,10

# Random timestamps in [start, end], later reduced to calendar dates.
rints = np.random.randint(start, end + 1, size = size)
df = pd.DataFrame(rints, columns = ['zeit'])
df["bytes"] = np.random.randint(5,20,size=size)
df["attr1"] = np.random.randint(5,100,size=size)
# Two-letter labels "AA", "AB", ... - one per row.
df["ind"] = ["{}{}".format(i,j) for i in string.ascii_uppercase for j in string.ascii_uppercase][:len(df)]
# Each row is assigned one of the first `numcats` uppercase letters.
choices = list(string.ascii_uppercase)[:numcats]
df['who']= np.random.choice(choices, len(df))
df["zeit"] = pd.to_datetime(df["zeit"], unit='s')
df.zeit = df.zeit.dt.date
df.sort_values('zeit', inplace = True)
df = df.reset_index(drop=True)
df.head(3)
Now, let's create a bar plot, also using hover tool:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
import ipywidgets as widgets

output_notebook()

# setup figure
# In hover tooltips "@name" looks up the value in the data-source column
# `name`, while "$index" is a special built-in field. The original used
# "#name", which is shown as literal text instead of the column value.
hover = HoverTool(tooltips=[
    ("index", "$index"),
    ("ind", "@ind"),
    ("who", "@who"),
    ("bytes", "@bytes"),
    ("attr1", "@attr1"),
])
fig = figure(x_range=list(df.ind), plot_height=250, title="Test Bars",
             toolbar_location=None, tools=[hover])
# x is the glyph renderer for the bars; h is the notebook handle that
# push_notebook needs for later updates.
x = fig.vbar(x="ind", top="bytes", width=0.9, source=ColumnDataSource(df))
h=show(fig, notebook_handle=True)
I'm using a ipywidgets.widgets.SelectionRangeSlider to select a range of dates:
import ipywidgets as widgets

# create slider
# One entry per calendar day between the earliest and latest value in df.zeit.
dates = list(pd.date_range(df.zeit.min(), df.zeit.max(), freq='D'))
# (label, value) pairs: the label is the day formatted as dd.mm.YYYY.
options = [(day.strftime('%d.%m.%Y'), day) for day in dates]
# Initial selection spans the whole range (first to last option).
index = (0, len(dates)-1)
slider_settings = {
    'options': options,
    'index': index,
    'description': 'Test',
    'orientation': 'horizontal',
    'layout': {'width': '500px'},
}
myslider = widgets.SelectionRangeSlider(**slider_settings)
def update_source(df, start, end):
    """Return the rows of ``df`` whose ``zeit`` lies in ``[start, end)``.

    ``start`` is inclusive and ``end`` is exclusive. The selected rows are
    returned unchanged (original index and columns).
    """
    on_or_after_start = df.zeit >= start
    before_end = df.zeit < end
    # Disabled aggregation from the original (sum of bytes per 'who'):
    #data = pd.DataFrame(x.groupby('who')['bytes'].sum())
    #data.sort_values(by="bytes", inplace=True)
    #data.reset_index(inplace=True)
    #return data
    return df[on_or_after_start & before_end]
def gui(model, bars):
    """Build the slider callback bound to ``model``.

    The returned function takes the slider value (a pair of timestamps),
    reduces both ends to dates and filters ``model`` with ``update_source``.
    ``bars`` is accepted but not used by the callback.
    """
    def myupdate(control1):
        lower, upper = control1[0], control1[1]
        #display(update_source(model, start, end).head(4))
        # The filtered frame is computed but currently not used further.
        data = update_source(model, lower.date(), upper.date())
    return myupdate


# Wire the callback to the slider; the slider value arrives as `control1`.
widgets.interactive(gui(df, x), control1=myslider)
The problem is, I can't get an update to the graph from the widget:
# Builds a new ColumnDataSource from the rows inside the slider's current range,
# assigns it to the renderer, then pushes the change to the notebook handle.
# NOTE(review): this swaps in a brand-new source object instead of changing the existing
# source's data - presumably related to the hover tool no longer working; confirm.
x.data_source = ColumnDataSource(update_source(df, myslider.value[0].date(), myslider.value[1].date()))
push_notebook(handle=h)
At least, it does something with the plot, as hover is not working anymore...
What am I missing? Or is this a bug?
Thanks for any help
Markus
Figured out how to do it using bokeh: https://github.com/bokeh/bokeh/issues/7082, but unfortunately it only works sometimes...
Best to use CDSView.

Patch glyph not updated when using multiple ColumnDataSources in bokeh app

I am trying to use the bokeh server to plot a time series together with a shaded percentile band around, and this, since bokeh does not support the fill_between function from matplotlib, requires the construction of a patch object of double dimension. Hence, I need two ColumnDataSources to hold the data. However, only the first curve is rendered correctly when the data changes. Although the data_source of the GlyphRenderer is updated, the figure does not change. I use bokeh 0.12.3, and have tried with several servers and browsers. A complete, and reasonably minimal example:
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.layouts import column
from bokeh.io import curdoc
from bokeh.models.widgets import Select
# Holds the sample curves plus the two data sources (line and patch) and the
# renderers that display them.
class AppData:
# n: number of curves to generate; curve i is sin(x) - i on 20 points in [0, 10].
def __init__(self, n):
self.p_source = None
self.c_source = None
self.x = np.linspace(0, 10, 20)
self.n = n
self.ys = [np.sin(self.x) - i for i in range(self.n)]
# Renderers are attached from outside after the figure has been created.
self.line = None
self.patch = None
# Select curves a..b-1 (exactly five), use the first and last as the band outline
# and the middle one as the centre line, then create or update both sources.
def update_module(self, a, b):
assert b - a == 5
# After the first call these are the sources' own .data objects, not copies.
p_data = dict() if self.p_source is None else self.p_source.data
c_data = dict() if self.c_source is None else self.c_source.data
ys = [self.ys[j] for j in range(a, b)]
if "x" not in c_data:
c_data["x"] = self.x
# Patch outline in x: forward along x, then back in reverse order.
p_data["x"] = c_data["x"].tolist() + c_data["x"][::-1].tolist()
n_r = len(ys[0])
n_p = 2*n_r
if "ys" not in p_data:
p_data["ys"] = np.empty((n_p))
# First half: first selected curve; second half: last selected curve, reversed.
# NOTE(review): these two lines write into the existing array in place, whereas "y" below is
# rebound to a new array - presumably in-place writes are not detected as a change by the
# data source, which would explain why only the line redraws; confirm.
p_data["ys"][:n_r] = ys[0]
p_data["ys"][n_r:] = np.flipud(ys[-1])
c_data["y"] = ys[2]
if self.p_source is None:
self.p_source = ColumnDataSource(data=p_data)
else:
# p_data is self.p_source.data here, so this updates the mapping with itself.
self.p_source.data.update(p_data)
if self.c_source is None:
self.c_source = ColumnDataSource(data=c_data)
else:
self.c_source.data.update(c_data)
# Debug output once the renderers have been attached.
if self.line is not None:
print(max(self.line.data_source.data["y"]))
print(max(self.patch.data_source.data["ys"])) # The value changes, but the figure does not!
# Build the initial data (curves 4..8) before any glyph is created.
app_data = AppData(10)
app_data.update_module(4, 4 + 5)

# One figure holding the centre line and the shaded band.
s1 = figure(width=500, plot_height=125, title=None, toolbar_location="above")
app_data.line = s1.line("x", "y", source=app_data.c_source)
app_data.patch = s1.patch("x", "ys", source=app_data.p_source, alpha=0.3, line_width=0)

# Drop-down offering the cases "0".."4"; case k shows curves k..k+4.
select = Select(title="Case", options=list(map(str, range(5))), value="4")


def select_case(attrname, old, new):
    """Recompute both data sources for the case currently selected."""
    first = int(select.value)
    app_data.update_module(first, first + 5)


select.on_change('value', select_case)

layout = column(select, s1)
document = curdoc()
document.add_root(layout)
document.title = "Example of patches not being updated"
I am certainly not very experienced in using bokeh, so I could very well be using the system wrong. However, any help on this matter would be of great help!

Microsoft Azure Machine Learning and Cognitive Services API

Is it possible to call the Cognitive Services API in Azure ML Studio when building a model? Is there any document or sample experiment that can be used as a reference?
Thanks in advance.
Here is the sample code you can try:
import urllib2
import urllib
import sys
import base64
import json
import numpy as np
import pandas as pd
# The entry point function can contain up to two input arguments:
# Param<dataframe1>: a pandas.DataFrame
# Param<dataframe2>: a pandas.DataFrame
def azureml_main(dataframe1 = None, dataframe2 = None):
    """Score the texts in ``dataframe1`` with the Text Analytics sentiment API.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Must have a ``'Text'`` column; each row becomes one document.
    dataframe2 : pandas.DataFrame
        Row 0 of column ``'Col1'`` is read as the API subscription key.

    Returns
    -------
    pandas.DataFrame
        One column, ``'SentimentScore'``, holding the returned scores as
        strings. The original text is not included.

    NOTE(review): the original comment says the return value must be a
    sequence of DataFrames, but a bare DataFrame is returned (unchanged
    here) - confirm what the hosting module expects.
    """
    account_key = str(dataframe2['Col1'][0])
    base_url = 'https://westus.api.cognitive.microsoft.com/'
    headers = {'Content-Type':'application/json', 'Ocp-Apim-Subscription-Key':account_key}

    # Build the request body with json.dumps. The original concatenated the
    # JSON by hand, which left a trailing comma after the last document and
    # did not escape backslashes or newlines in the text.
    # Double quotes are still replaced by single quotes so the service
    # receives the same text as before. Iterating the column values keeps
    # the ids positional regardless of the frame's index labels.
    documents = [
        {'id': str(i), 'text': str(text).replace("\"", "'")}
        for i, text in enumerate(dataframe1['Text'])
    ]
    input_texts = json.dumps({'documents': documents})
    print(input_texts)

    # Detect sentiment.
    batch_sentiment_url = base_url + 'text/analytics/v2.0/sentiment'
    req = urllib2.Request(batch_sentiment_url, input_texts, headers)
    response = urllib2.urlopen(req)
    try:
        result = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    obj = json.loads(result)

    sentiment_scores = [str(sentiment_analysis['score']) for sentiment_analysis in obj['documents']]
    sentiment_scores = pd.Series(np.array(sentiment_scores))
    df1 = pd.DataFrame({'SentimentScore':sentiment_scores})
    # Don't return the original text
    return df1
It is possible to execute Python snippets inside Azure ML. From there, you may call the Microsoft Cognitive Services API using a Python interface (take a look at the example for the Face API from Python).

Resources