How to scrape this page with BeautifulSoup? - web-scraping

I am trying to scrape the page below using the following code with BeautifulSoup:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import lxml

url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
page = urlopen(url)
bs = BeautifulSoup(page, "lxml")
print(bs.get_text())

all_links = bs.find_all("div", {"class": "views-field views-field-title"})
for link in all_links:
    content = link.get_text()
    print(content)

all_links = bs.find_all("div", {"class": "mobile-header"})
for link in all_links:
    content = link.get_text()
    print(content)
Can you please provide some pointers on how to print/extract the data for all firms in the following format?
Firm|product|Fee|Exchange rate margin(%)|Total Cost Percent(%)|Total Cost(AUD)
Bank of China|28.00|5.77|19.77|39.54
ANZ Bank|32.00|4.39|20.39|40.78

import requests
from bs4 import BeautifulSoup

url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
r = requests.get(url, verify=False)
soup = BeautifulSoup(r.text, 'lxml')

rows = [i.get_text("|").split("|") for i in soup.select('#tab-1 .corridor-row')]
for row in rows:
    # a, b, c, d, e = row[2], row[15], row[18], row[21], row[25]
    # print(a, b, c, d, e, sep='|')
    print('{0[2]}|{0[15]}|{0[18]}|{0[21]}|{0[25]}'.format(row))
Citibank|0.00|1.53|1.53|3.06
Transferwise|5.05|-0.04|2.48|4.96
Western Union|5.00|1.19|3.69|7.38
MoneyGram|8.00|1.06|5.06|10.12
WorldRemit|7.99|1.30|5.30|10.60
Ria|10.00|0.84|5.84|11.68
Ceylon Exchange|10.00|1.37|6.37|12.74
Western Union|9.95|1.69|6.66|13.32
Orbit Remit|13.00|0.78|7.28|14.56
Money2anywhere|12.00|1.71|7.71|15.42
SUPAY|18.00|-1.24|7.76|15.52
Money Chain Foreign Exchange|18.00|-1.12|7.88|15.76
MoneyGram|15.00|1.30|8.80|17.60
Commonwealth Bank|22.00|3.43|14.43|28.86
Bank of China|28.00|1.50|15.50|31.00
ANZ Bank|24.00|4.51|16.51|33.02
National Australia Bank (NAB)|22.00|5.74|16.74|33.48
Bank of China|32.00|1.50|17.50|35.00
Commonwealth Bank|30.00|3.43|18.43|36.86
ANZ Bank|32.00|4.51|20.51|41.02
National Australia Bank (NAB)|30.00|5.74|20.74|41.48
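If you would rather have the data in the pipe-delimited format from the question, with a header row and a file you can open later, you could collect the fields into a pandas DataFrame. This is a minimal sketch that reuses the same selector and column indices (2, 15, 18, 21, 25) from the answer above; those indices and the output file name are assumptions that depend on the page layout not having changed:

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
r = requests.get(url, verify=False)
soup = BeautifulSoup(r.text, 'lxml')

records = []
for i in soup.select('#tab-1 .corridor-row'):
    row = i.get_text("|").split("|")
    # Column indices taken from the answer above; they assume the current page layout.
    records.append({
        'Firm': row[2],
        'Fee': row[15],
        'Exchange rate margin(%)': row[18],
        'Total Cost Percent(%)': row[21],
        'Total Cost(AUD)': row[25],
    })

df = pd.DataFrame(records)
df.to_csv('remittance_costs.csv', sep='|', index=False)  # hypothetical output file name
print(df.to_string(index=False))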

Related

How to extract title name and rating of a movie from IMDB database?

I'm very new to web scraping in Python. I want to extract the movie name, release year, and rating from the IMDb database. This is the IMDb chart page with the most popular movies and their ratings: https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm. I use the requests and BeautifulSoup modules. Here is my code:
movies = bs.find('tbody',class_='lister-list').find_all('tr')
When I tried to extract the movie name, rating & year, I got the same attribute error for all of them.
<td class="titleColumn">
    Glass Onion: une histoire à couteaux tirés
    <span class="secondaryInfo">(2022)</span>
    <div class="velocity">1
        <span class="secondaryInfo">(
        <span class="global-sprite telemeter up"></span>
        1)</span>
    </div>
</td>
<td class="ratingColumn imdbRating">
    <strong title="7,3 based on 207 962 user ratings">7,3</strong>
</td>
title = movies.find('td',class_='titleColumn').a.text
rating = movies.find('td',class_='ratingColumn imdbRating').strong.text
year = movies.find('td',class_='titleColumn').span.text.strip('()')
AttributeError                            Traceback (most recent call last)
<ipython-input-9-2363bafd916b> in <module>
----> 1 title = movies.find('td',class_='titleColumn').a.text
      2 title

~\anaconda3\lib\site-packages\bs4\element.py in __getattr__(self, key)
   2287     def __getattr__(self, key):
   2288         """Raise a helpful exception to explain a common code fix."""
-> 2289         raise AttributeError(
   2290             "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
   2291         )

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Can someone help me to solve the problem? Thanks in advance!
To collect the results into a list, you can try the following example.
from bs4 import BeautifulSoup
import requests
import pandas as pd

data = []
res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm")
#print(res)
soup = BeautifulSoup(res.content, "html.parser")

for card in soup.select('.chart.full-width tbody tr'):
    data.append({
        "title": card.select_one('.titleColumn a').get_text(strip=True),
        "year": card.select_one('.titleColumn span').text,
        'rating': card.select_one('td[class="ratingColumn imdbRating"]').get_text(strip=True)
    })

df = pd.DataFrame(data)
print(df)
#df.to_csv('out.csv', index=False)
Output:
title year rating
0 Avatar: The Way of Water (2022) 7.9
1 Glass Onion (2022) 7.2
2 The Menu (2022) 7.3
3 White Noise (2022) 5.8
4 The Pale Blue Eye (2022) 6.7
.. ... ... ...
95 Zoolander (2001) 6.5
96 Once Upon a Time in Hollywood (2019) 7.6
97 The Lord of the Rings: The Fellowship of the Ring (2001) 8.8
98 New Year's Eve (2011) 5.6
99 Spider-Man: No Way Home (2021) 8.2
[100 rows x 3 columns]
Update: to extract the data using the find_all and find methods instead:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
data = []
res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm", headers=headers)
#print(res)
soup = BeautifulSoup(res.content, "html.parser")

for card in soup.table.tbody.find_all("tr"):
    data.append({
        "title": card.find("td", class_="titleColumn").a.get_text(strip=True),
        "year": card.find("td", class_="titleColumn").span.get_text(strip=True),
        'rating': card.find('td', class_="ratingColumn imdbRating").get_text(strip=True)
    })

df = pd.DataFrame(data)
print(df)
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all returns a list (a ResultSet), so movies is a list of elements rather than a single element. You need to iterate over it with for movie in movies:
for movie in movies:
    title = movie.find('td', class_='titleColumn').a.text
    rating = movie.find('td', class_='ratingColumn imdbRating').strong.text
    year = movie.find('td', class_='titleColumn').span.text.strip('()')
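If you want to keep each movie's fields together instead of overwriting the variables on every pass, you can collect them into a list of dicts as you iterate. A minimal sketch using the same selectors as above; the User-Agent header and the missing-rating guard are my additions, not part of the original question:

import requests
from bs4 import BeautifulSoup

# The User-Agent header is an assumption; IMDb sometimes blocks requests without one.
res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm",
                   headers={'User-Agent': 'Mozilla/5.0'})
bs = BeautifulSoup(res.content, 'html.parser')
movies = bs.find('tbody', class_='lister-list').find_all('tr')

movies_data = []
for movie in movies:
    rating_cell = movie.find('td', class_='ratingColumn imdbRating')
    movies_data.append({
        'title': movie.find('td', class_='titleColumn').a.text,
        # Some rows have no rating yet, so guard against a missing <strong> tag.
        'rating': rating_cell.strong.text if rating_cell.strong else None,
        'year': movie.find('td', class_='titleColumn').span.text.strip('()'),
    })

print(movies_data[:3])  # quick sanity check on the first few rows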

Web scrape stock data from Reuters

I am a programming beginner trying to extract key metric data (e.g. Beta) for a stock from Reuters. However, it always comes back blank.
My code is like this:
from bs4 import BeautifulSoup as bs
import requests
import re
url = 'https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume'
page = requests.get(url)
bs1 = bs(page.text, 'html.parser')
beta=bs1.find_all('th', class_ ='text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__body__yKS5U body__base__22dCE body__body__VgU9Q',text=re.compile('Beta'))
print(beta)
I know it is not correct, but I cannot figure out what to do. Please help. Ultimately I want to extract the Beta info for a stock from Reuters. Thank you for your help!
You can scrape the site (without digging into the JavaScript/JSON) using Selenium. This keeps bs4 from my previous answer, but you could use Selenium's own locator functions instead.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
# Initiate webdriver
driver = webdriver.Firefox()
# Fetch the web page
driver.get('https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume')
# Convert the driver page source to a soup object
soup = bs(driver.page_source, 'html.parser')
# Find the table you want to scrape
table = soup.find('table', attrs={'aria-label':'KeyMetrics'})
# Locate the Keys and Value for each of the rows
keys = [i.text for i in table.select('tbody tr th') if i]
values = [i.text for i in table.select('tbody tr td') if i]
# Convert the two lists into a dictionary for a neater output
data = dict(zip(keys,values))
driver.quit()
print(data)
This will return:
{'Price Closing Or Last Bid': '699.20', 'Pricing Date': 'Jul 05', '52 Week High': '1,243.25', '52 Week High Date': 'Nov 04', '52 Week Low': '620.50', '52 Week Low Date': 'Jul 08', '10 Day Average Trading Volume': '31.36', '3 Month Average Trading Volume': '602.72', 'Market Capitalization': '724,644.30', 'Beta': '2.13', '1 Day Price Change': '2.55', '5 Day Price Return (Daily)': '-4.84', '13 Week Price Return (Daily)': '-35.93', '26 Week Price Return (Daily)': '-39.18', '52 Week Price Return (Daily)': '2.99', 'Month To Date Price Return (Daily)': '3.83', 'Year To Date Price Return (Daily)': '-33.84', 'Price Relative To S&P500 (4 Week)': '5.95', 'Price Relative To S&P500 (13 Week)': '-24.33', 'Price Relative To S&P500 (26 Week)': '-23.90', 'Price Relative To S&P500 (52 Week)': '16.99', 'Price Relative To S&P500 (YTD)': '-17.69'}
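If the table has not finished rendering by the time page_source is read, the find can come back as None. A small sketch, using Selenium's standard explicit-wait API, that waits for the table before parsing; the aria-label value is taken from the answer above and the 15-second timeout is an arbitrary choice:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs

driver = webdriver.Firefox()
driver.get('https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume')

# Wait (up to 15 s) until the key-metrics table is present in the DOM
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table[aria-label="KeyMetrics"]'))
)

soup = bs(driver.page_source, 'html.parser')
table = soup.find('table', attrs={'aria-label': 'KeyMetrics'})
driver.quit()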
Here's one way of collecting the data you need:
from bs4 import BeautifulSoup as bs
import requests
import re
url = 'https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume'
page = requests.get(url)
soup = bs(page.text, 'html.parser')
# Locate the Table you wish to scrape
table = soup.select_one('table.table__table__2px_A')
# Locate the Keys and Value for each of the rows
keys = [i.text for i in table.select('tr th') if i]
values = [i.text for i in table.select('tr td') if i]
# Convert the two lists into a dictionary for a neater output
data = dict(zip(keys,values))
This will return the following (note that with plain requests the key-metrics table is generally not in the initial HTML, so the selector ends up matching a different table on the page, as the output shows):
{'% Change': '671.00',
'Brent Crude Oil': '-1.40%Negative',
'CBOT Soybeans': '1,626.00',
'Copper': '111.91',
'Future': '1,805.20',
'Gold': '-0.57%Negative',
'Last': '+0.35%Positive'}

Price web scraping using BeautifulSoup

Goal: I'm trying to scrape prices.
Expected output: 2 columns: 1) productName (OK), 2) price (not OK, I get NaN).
I tried the following:
import urllib3
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time

urllib3.disable_warnings()
t0 = time.time()

scrap_list = pd.DataFrame(columns=['Item_name', 'Item_price'])

html = urlopen('https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html').read().decode("utf-8")
bs = BeautifulSoup(html, 'html.parser')

scrap_name = bs.find_all(["h1"])
product_name = pd.DataFrame(scrap_name, columns=['Item_name'])

scrap_price = bs.find_all("span", {'class': 'rs-unit'})
product_price = pd.DataFrame(scrap_price, columns=['Item_price'])

scrap_list = scrap_list.append(pd.concat([product_name['Item_name'], product_price['Item_price']],
                                         axis=1))

t1 = time.time()
r = t1 - t0
print(r)
print(scrap_list)
The data is within the <meta> tags.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

t0 = time.time()
page_proximus = requests.get("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus.text, 'html.parser')

rows = []
metaData = soup.find_all('meta', {'property': 'og:description'})
for meta in metaData:
    row = {'Item_name': meta.find('meta', {'name': 'device_model'})['content'],
           'Item_price': meta.find('meta', {'name': 'device_price'})['content']}
    rows.append(row)

t1 = time.time()
r = t1 - t0
print(r)

df = pd.DataFrame(rows)
print(df)
Output:
Item_name Item_price
0 iPhone 13 256GB Pink 1029,99
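If the nested lookup ever returns None, a slightly more defensive variant is to search for the named meta tags directly. This is a sketch that assumes the page still exposes <meta name="device_model"> and <meta name="device_price"> tags, as in the answer above:

import requests
from bs4 import BeautifulSoup

url = "https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Look the tags up directly by name instead of nesting finds;
# the tag names come from the answer above and assume the page still uses them.
name_tag = soup.find('meta', {'name': 'device_model'})
price_tag = soup.find('meta', {'name': 'device_price'})
if name_tag and price_tag:
    print(name_tag['content'], price_tag['content'])
else:
    print('meta tags not found - the page structure may have changed')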

Why Does This Scrape Stop After 1st Iteration?

My code accesses a page where each row may or may not have a drop-down with more information.
I have a try/except statement to check for this.
It works fine for the first row, but not for the second?
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

gg = []
r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=2')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')

for session in sessions:
    jj = session.select_one('h4').text
    print(jj)
    sub_session = session.select('.sub_accordin_presentation')
    try:
        if sub_session:
            kk = [re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session]
            print(kk)
    except:
        kk = ' '
    dict = {"Title": jj, "Sub": kk}
    gg.append(dict)

df = pd.DataFrame(gg)
df.to_csv('test2.csv')
To get all sections + sub-sections, try:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get(
    "https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=2"
)
soup = bs(r.text, "lxml")
sessions = soup.select("#accordin > ul > li")

gg = []
for session in sessions:
    jj = session.h4.get_text(strip=True, separator=" ")
    sub_sessions = session.select(".sub_accordin_presentation")
    if sub_sessions:
        for sub_session in sub_sessions:
            gg.append(
                {
                    "Title": jj,
                    "Sub": sub_session.h4.get_text(strip=True, separator=" "),
                }
            )
    else:
        gg.append(
            {
                "Title": jj,
                "Sub": "None",
            }
        )

df = pd.DataFrame(gg)
df.to_csv("data.csv", index=False)
print(df)
Prints:
Title Sub
0 IS05 - Industry Symposium Sponsored by Amgen: Advancing Lung Cancer Treatment with Novel Therapeutic Targets None
1 IS06 - Industry Symposium Sponsored by Jazz Pharmaceuticals: Exploring a Treatment Option for Patients with Previously Treated Metastatic Small Cell Lung Cancer (SCLC) None
2 IS07 - Satellite CME Symposium by Sanofi Genzyme: On the Frontline: Immunotherapeutic Approaches in Advanced NSCLC None
3 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.01 - Durvalumab ± Tremelimumab + Chemotherapy as First-line Treatment for mNSCLC: Results from the Phase 3 POSEIDON Study
4 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.02 - Discussant
5 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.03 - Lurbinectedin/doxorubicin versus CAV or Topotecan in Relapsed SCLC Patients: Phase III Randomized ATLANTIS Trial
...
and creates data.csv with the same contents.

How do I scrape multiple pages and write the data to Excel?

How do I scrape multiple pages into a file I can open in Excel?
For example, I want to scrape "http://econpy.pythonanywhere.com/ex/001.html".
How do I scrape the next pages when the number of pages is unknown?
Also, I have written some code, but it writes NoneType to the file instead of the data.
from bs4 import BeautifulSoup
from urllib.request import urlopen

page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1, Header2\n"
f.write(Headers)

html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title": "buyer-info"})

for i in buyer_info:
    Header1 = i.find("div", {"title": "buyer-name"})
    Header2 = i.find("span", {"class": "item-price"})
    salmon = print(Header1.get_text())
    salam = print(Header2.get_text())
    f.write("{}".format(salmon) + "{}".format(salam))

f.close()
What am I doing wrong?
Give this a try and let me know if you have any issues. (In your original code, salmon and salam are None because print() returns None, which is why NoneType ends up in the file.) I used CSS selectors and requests to accomplish this.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('Mynew.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Price"])

for page in range(1, 6):
    html = requests.get("http://econpy.pythonanywhere.com/ex/00{0}.html".format(page))
    soup = BeautifulSoup(html.text, "html.parser")
    for item in soup.select("div[title=buyer-info]"):
        Header1 = item.select_one("div[title=buyer-name]").get_text()
        Header2 = item.select_one("span.item-price").get_text()
        writer.writerow([Header1, Header2])
        print(Header1, Header2)

outfile.close()
I got it solved for the first page, and this is the code:
from bs4 import BeautifulSoup
from urllib.request import urlopen

page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1,Header2\n"
f.write(Headers)

html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title": "buyer-info"})

for i in buyer_info:
    Header1 = i.find("div", {"title": "buyer-name"})
    Header2 = i.find("span", {"class": "item-price"})
    f.write('{},{}\n'.format(Header1.text, Header2.text))

f.close()
Now the pain point is how to scrape multiple pages, i.e. how to scrape the next pages as well when their number is unknown? (See the sketch below.)
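When the number of pages is unknown, one common approach is to keep incrementing the page number until a page comes back empty or with an error status. A minimal sketch, assuming the pages keep following the 00N.html naming pattern and that a missing page or a page with no buyer-info divs marks the end:

import csv
import requests
from bs4 import BeautifulSoup

with open('Mynew.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Price"])

    page = 1
    while True:
        url = "http://econpy.pythonanywhere.com/ex/{:03d}.html".format(page)
        r = requests.get(url)
        if r.status_code != 200:
            break  # page does not exist -> no more pages
        soup = BeautifulSoup(r.text, "html.parser")
        buyers = soup.find_all("div", {"title": "buyer-info"})
        if not buyers:
            break  # an empty page also marks the end
        for item in buyers:
            name = item.find("div", {"title": "buyer-name"}).get_text()
            price = item.find("span", {"class": "item-price"}).get_text()
            writer.writerow([name, price])
        page += 1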
