Web scraping relevant information from soup file

I am trying to scrape this particular URL to obtain information on branch/ATM names and location addresses:
url="https://www.bankmayapada.com/en/contactus/location-information"
However, the soup I get back is pretty confusing and I am unable to figure out how to extract the required information.
What I need is each branch/ATM name and its corresponding address. Right now, I am just trying to figure out the structure of the soup.
import re
import requests
from bs4 import BeautifulSoup
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())

You can get that table's data with a single POST request. Fun fact, no payload required!
Here's how:
import requests
from bs4 import BeautifulSoup
page = requests.post("https://myapps.bankmayapada.com/frontend/IN/lokasi.aspx").text
rows = BeautifulSoup(page, "html.parser").find_all("tr", {"class": "dxgvDataRow"})
branch_location_data = []
for row in rows:
    province, area, location = row.find_all("td")
    branch_location_data.append(
        [
            province.getText(strip=True),  # province column
            area.getText(strip=True),  # area column
            location.find("b").getText(strip=True),  # branch name
            " ".join(
                d.getText() for d in location.find_all("div")  # branch address
                if not d.getText().startswith(("Tel", "Fax"))  # skip phone & fax lines
            ),
        ]
    )
for branch in branch_location_data:
    print(branch)
Output:
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Citra Garden 2', 'Rukan Citra Niaga Blok A-7 Jl. Utan Jati - Kalideres Jakarta - DKI Jakarta']
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Puri Indah', 'Jl. Puri Indah Raya Blok I No. 2 Jakarta 11610 - DKI Jakarta']
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Pasar Pagi Asemka', 'Jl. Pasar Pagi No. 84 Jakarta - DKI Jakarta']
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Tanjung Duren', 'Jl. Tanjung Duren No. 91 B Jakarta 11470 - DKI Jakarta']
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Meruya', 'Jl. Meruya Ilir Raya No. 82 G Jakarta - DKI Jakarta']
['DKI JAKARTA', 'Jakarta Barat', 'Kantor Capem Jembatan Lima', 'Jl. KH Moch. Mansyur No. 24 A Jakarta - DKI Jakarta']
and so on...
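If you want the result as a spreadsheet rather than printed lists, a minimal follow-up sketch using pandas (the column names here are my own, not from the site):
import pandas as pd
# build a DataFrame from the scraped rows and save it to CSV
df = pd.DataFrame(branch_location_data, columns=["Province", "Area", "Branch", "Address"])
df.to_csv("branches.csv", index=False)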

Related

How to extract title name and rating of a movie from IMDB database?

I'm very new to web scraping in Python. I want to extract the movie name, release year, and rating from the IMDB database. This is the IMDB chart page with the movies and ratings: https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm. I use the modules BeautifulSoup and requests. Here is my code:
movies = bs.find('tbody',class_='lister-list').find_all('tr')
When I tried to extract the movie name, rating & year, I got the same attribute error for all of them.
<td class="titleColumn">
    Glass Onion: une histoire à couteaux tirés
    <span class="secondaryInfo">(2022)</span>
    <div class="velocity">1
        <span class="secondaryInfo">(
            <span class="global-sprite telemeter up"></span>
        1)</span>
    </div>
</td>
<td class="ratingColumn imdbRating">
    <strong title="7,3 based on 207 962 user ratings">7,3</strong>
</td>
title = movies.find('td',class_='titleColumn').a.text
rating = movies.find('td',class_='ratingColumn imdbRating').strong.text
year = movies.find('td',class_='titleColumn').span.text.strip('()')
AttributeError                            Traceback (most recent call last)
<ipython-input-9-2363bafd916b> in <module>
----> 1 title = movies.find('td',class_='titleColumn').a.text
      2 title

~\anaconda3\lib\site-packages\bs4\element.py in __getattr__(self, key)
   2287     def __getattr__(self, key):
   2288         """Raise a helpful exception to explain a common code fix."""
-> 2289         raise AttributeError(
   2290             "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
   2291         )

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Can someone help me to solve the problem? Thanks in advance!
To get the data from each row of the ResultSet, you can try the next example.
from bs4 import BeautifulSoup
import requests
import pandas as pd
data = []
res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm")
#print(res)
soup = BeautifulSoup(res.content, "html.parser")
for card in soup.select('.chart.full-width tbody tr'):
    data.append({
        "title": card.select_one('.titleColumn a').get_text(strip=True),
        "year": card.select_one('.titleColumn span').text,
        "rating": card.select_one('td[class="ratingColumn imdbRating"]').get_text(strip=True)
    })
df = pd.DataFrame(data)
print(df)
#df.to_csv('out.csv', index=False)
Output:
title year rating
0 Avatar: The Way of Water (2022) 7.9
1 Glass Onion (2022) 7.2
2 The Menu (2022) 7.3
3 White Noise (2022) 5.8
4 The Pale Blue Eye (2022) 6.7
.. ... ... ...
95 Zoolander (2001) 6.5
96 Once Upon a Time in Hollywood (2019) 7.6
97 The Lord of the Rings: The Fellowship of the Ring (2001) 8.8
98 New Year's Eve (2011) 5.6
99 Spider-Man: No Way Home (2021) 8.2
[100 rows x 3 columns]
Update: to extract the data using the find_all and find methods instead:
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0'}
data = []
res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm", headers=headers)  # pass the headers so they are actually used
#print(res)
soup = BeautifulSoup(res.content, "html.parser")
for card in soup.table.tbody.find_all("tr"):
    data.append({
        "title": card.find("td", class_="titleColumn").a.get_text(strip=True),
        "year": card.find("td", class_="titleColumn").span.get_text(strip=True),
        "rating": card.find("td", class_="ratingColumn imdbRating").get_text(strip=True)
    })
df = pd.DataFrame(data)
print(df)
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all returns a ResultSet (a list of elements), meaning that movies is a list. You need to iterate over it with for movie in movies:
for movie in movies:
    title = movie.find('td', class_='titleColumn').a.text
    rating = movie.find('td', class_='ratingColumn imdbRating').strong.text
    year = movie.find('td', class_='titleColumn').span.text.strip('()')
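As written, each pass overwrites title, rating, and year, so only the last movie survives the loop. A minimal extension that collects every row (records is my own name, not from the original code):
records = []
for movie in movies:
    # gather the three fields per row instead of overwriting them
    records.append({
        'title': movie.find('td', class_='titleColumn').a.text,
        'rating': movie.find('td', class_='ratingColumn imdbRating').strong.text,
        'year': movie.find('td', class_='titleColumn').span.text.strip('()'),
    })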

Why Does This Scrape Stop After 1st Iteration?

My code accesses a page where each row may or may not have a drop-down containing more information.
I have a try/except statement to check for this.
It works fine for the first row, but not for the second?
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
gg=[]
r = requests.get('https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=2')
soup = bs(r.text, 'lxml')
sessions = soup.select('#accordin > ul > li')
for session in sessions:
    jj = session.select_one('h4').text
    print(jj)
    sub_session = session.select('.sub_accordin_presentation')
    try:
        if sub_session:
            kk = [re.sub(r'[\n\s]+', ' ', i.text) for i in sub_session]
            print(kk)
    except:
        kk = ' '
    dict = {"Title": jj, "Sub": kk}
    gg.append(dict)
df = pd.DataFrame(gg)
df.to_csv('test2.csv')
Your loop builds at most one row per session, and when a session has no sub-sessions it reuses a stale kk from the previous iteration. To get all sections + sub-sections as separate rows, try:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
r = requests.get(
"https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=2"
)
soup = bs(r.text, "lxml")
sessions = soup.select("#accordin > ul > li")
gg = []
for session in sessions:
    jj = session.h4.get_text(strip=True, separator=" ")
    sub_sessions = session.select(".sub_accordin_presentation")
    if sub_sessions:
        for sub_session in sub_sessions:
            gg.append(
                {
                    "Title": jj,
                    "Sub": sub_session.h4.get_text(strip=True, separator=" "),
                }
            )
    else:
        gg.append(
            {
                "Title": jj,
                "Sub": "None",
            }
        )
df = pd.DataFrame(gg)
df.to_csv("data.csv", index=False)
print(df)
Prints:
Title Sub
0 IS05 - Industry Symposium Sponsored by Amgen: Advancing Lung Cancer Treatment with Novel Therapeutic Targets None
1 IS06 - Industry Symposium Sponsored by Jazz Pharmaceuticals: Exploring a Treatment Option for Patients with Previously Treated Metastatic Small Cell Lung Cancer (SCLC) None
2 IS07 - Satellite CME Symposium by Sanofi Genzyme: On the Frontline: Immunotherapeutic Approaches in Advanced NSCLC None
3 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.01 - Durvalumab ± Tremelimumab + Chemotherapy as First-line Treatment for mNSCLC: Results from the Phase 3 POSEIDON Study
4 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.02 - Discussant
5 PL02A - Plenary 2: Presidential Symposium (Rebroadcast) (Japanese, Mandarin, Spanish Translation Available) PL02A.03 - Lurbinectedin/doxorubicin versus CAV or Topotecan in Relapsed SCLC Patients: Phase III Randomized ATLANTIS Trial
...
and creates data.csv.

How to split elements inside <p> tag while web scraping

I am trying to scrape a URL, but the output is not in the desired format. I need just the branch name and address. How do I split this information out of the p tag?
import re
import requests
from bs4 import BeautifulSoup
page = requests.get(url)
Branch_list=[]
soup = BeautifulSoup(page.content, 'html.parser')
for i in soup.find_all('div', class_="col-md-9 text-left"):
    Branch = i.find_all('p') if i.find_all('p') else ''
    for k in Branch:
        k = re.sub(r'<(.*?)>', '', str(k))
        Branch_list.append(k)
Try this:
import re
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.bukopin.co.id/page/jaringankantor")
soup = BeautifulSoup(page.text, 'html.parser').find_all('div', class_="col-md-9 text-left")
paragraphs = [re.sub(r"Tel.+", "", p.find("p").getText(strip=True)) for p in soup]
for paragraph in paragraphs:
    print(paragraph)
Output:
KCP Rasuna SaidGd. Kementerian Koperasi & UKM, Lt. 1. Jl. HR. Rasuna Said Kav. 3 - 5, Jakarta Selatan 12940
KCP Plaza AsiaJl. Jend. Sudirman Kav. 59 No. 77 Lt. GF No. GF - D Blok A Senayan, Kebayoran Baru, Jakarta Selatan
KCP Bulog IIGedung Diklat Bulog II Jl. Kuningan Timur Blok M2 No.5 Jakarta Selatan 12950
KCP Pondok Indah Plaza VPlaza V Pondok Indah Kav.A11 Jl. Marga Guna Raya - Pondok Indah Jakarta Selatan
KCP Kebayoran LamaJl. Raya Kebayoran Lama No.10 Jakarta Selatan 12220
KCP Kebayoran BaruJl. RS. Fatmawati No.7 Blok A Kebayoran Baru Jakarta Selatan12140
KCP MelawaiJl. Melawai Raya Kebayoran Baru No. 66 Jakarta Selatan 12160
KK PLN Lenteng AgungJl. Raya Tanjung Barat No. 55 Jakarta Selatan 12610
and so on...
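As an aside, the branch name runs straight into the address ("KCP Rasuna SaidGd. ...") because getText(strip=True) joins child strings with no separator. Passing a separator keeps them apart, e.g. (a one-line tweak, not verified against the live page):
# get_text(" ", strip=True) joins the <strong> name and the address with a space
paragraphs = [re.sub(r"Tel.+", "", p.find("p").get_text(" ", strip=True)) for p in soup]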
EDIT: To get this into a pandas dataframe try this:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
page = requests.get("https://www.bukopin.co.id/page/jaringankantor")
soup = BeautifulSoup(page.text, 'html.parser').find_all('div', class_="col-md-9 text-left")
data = []
for div in soup:
    branch = div.find("strong").getText()
    address = div.find("p").getText(strip=True)
    data.append([branch, re.sub(r"Telp.+", "", address[len(branch):])])
print(pd.DataFrame(data, columns=["Branch", "Address"]))
Output:
Branch Address
0 KCP Rasuna Said Gd. Kementerian Koperasi & UKM, Lt. 1. Jl. HR....
1 KCP Plaza Asia Jl. Jend. Sudirman Kav. 59 No. 77 Lt. GF No. G...
2 KCP Bulog II Gedung Diklat Bulog II Jl. Kuningan Timur Blok...
3 KCP Pondok Indah Plaza V Plaza V Pondok Indah Kav.A11 Jl. Marga Guna Ra...
4 KCP Kebayoran Lama Jl. Raya Kebayoran Lama No.10 Jakarta Selatan ...
5 KCP Kebayoran Baru Jl. RS. Fatmawati No.7 Blok A Kebayoran Baru J...
...

Data Scraping with list in excel

I have a list in Excel: one code in Column A and another in Column B.
There is a website where I need to input both details into two different boxes, and it takes me to another page.
That page contains certain details which I need to scrape into Excel.
Any help with this?
Ok. Give this a shot:
import pandas as pd
import requests
df = pd.read_excel('C:/test/data.xlsx')
url = 'http://rla.dgft.gov.in:8100/dgft/IecPrint'
results = pd.DataFrame()
for row in df.itertuples():
    payload = {
        'iec': '%010d' % row[1],
        'name': row[2]}
    response = requests.post(url, params=payload)
    print('IEC: %010d\tName: %s' % (row[1], row[2]))
    try:
        dfs = pd.read_html(response.text)
    except ValueError:
        # pd.read_html raises ValueError when the response contains no tables
        print('The name given by you does not match with the data OR you have entered less than three letters')
        temp_df = pd.DataFrame([['%010d' % row[1], row[2], 'ERROR']],
                               columns=['IEC', 'Party Name and Address', 'ERROR'])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        results = pd.concat([results, temp_df], ignore_index=True)
        continue
    generalData = dfs[0]
    generalData = generalData.iloc[:, [0, -1]].set_index(generalData.columns[0]).T.reset_index(drop=True)
    directorData = dfs[1]
    directorData = directorData.iloc[:, [-1]].T.reset_index(drop=True)
    directorData.columns = ['director_%02d' % (each + 1) for each in directorData.columns]
    try:
        branchData = dfs[2]
        branchData = branchData.iloc[:, [-1]].T.reset_index(drop=True)
        branchData.columns = ['branch_%02d' % (each + 1) for each in branchData.columns]
    except IndexError:
        # not every record has a branch table
        branchData = pd.DataFrame()
        print('No Branch Data.')
    temp_df = pd.concat([generalData, directorData, branchData], axis=1)
    results = pd.concat([results, temp_df], ignore_index=True)
results.to_excel('path.new_file.xlsx', index=False)
Output:
print (results.to_string())
IEC IEC Allotment Date File Number File Date Party Name and Address Phone No e_mail Exporter Type IEC Status Date of Establishment BIN (PAN+Extension) PAN ISSUE DATE PAN ISSUED BY Nature Of Concern Banker Detail director_01 director_02 director_03 branch_01 branch_02 branch_03 branch_04 branch_05 branch_06 branch_07 branch_08 branch_09
0 0305008111 03.05.2005 04/04/131/51473/AM20/ 20.08.2019 NISSAN MOTOR INDIA PVT. LTD. PLOT-1A,SIPCOT IN... 918939917907 shailesh.kumar#rnaipl.com 5 Merchant/Manufacturer Valid IEC 2005-02-07 AACCN0695D FT001 NaN NaN 3 Private Limited STANDARD CHARTERED BANK A/C Type:1 CA A/C No :... HARDEEP SINGH BRAR GURMEL SINGH BRAR HOUSE NO ... JEROME YVES MARIE SAIGOT THIERRY SAIGOT A9/2, ... KOJI KAWAKITA KIHACHI KAWAKITA 3-21-3, NAGATAK... Branch Code:165TH FLOOR ORCHID BUSINESS PARK,S... Branch Code:14NRPDC , WAREHOUSE NO.B -2A,PATAU... Branch Code:12EQUINOX BUSINESS PARK TOWER 3 4T... Branch Code:8GRAND PALLADIUM,5TH FLR.,B WING,,... Branch Code:6TVS LOGISTICS SERVICES LTD.SING,C... Branch Code:2PLOT 1A SIPCOT INDUL PARK,ORAGADA... Branch Code:5BLDG.NO.3 PART,124A,VALLAM A,SRIP... Branch Code:15SURVEY NO. 678 679 680 681 682 6... Branch Code:10INDOSPACE SKCL INDL.PARK,BULD.NO...

How to scrape this page with BeautifulSoup?

I am trying to scrape the page below using the following code with BeautifulSoup:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import lxml
url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
page=urlopen(url)
bs = BeautifulSoup(page,"lxml")
print(bs.get_text())
all_links=bs.find_all("div", {"class":"views-field views-field-title" })
for link in all_links:
    content = link.get_text()
    print(content)
all_links = bs.find_all("div", {"class": "mobile-header"})
for link in all_links:
    content = link.get_text()
    print(content)
Can you please provide some pointers to print/extract the data for all firms in the format below?
Firm|product|Fee|Exchange rate margin(%)|Total Cost Percent(%)|Total Cost(AUD)
Bank of China|28.00|5.77|19.77|39.54
ANZ Bank|32.00|4.39|20.39|40.78
Regards
-Abacus
import requests
from bs4 import BeautifulSoup
url = 'https://remittanceprices.worldbank.org/en/corridor/Australia/China'
r = requests.get(url, verify=False)  # verify=False disables SSL certificate verification
soup = BeautifulSoup(r.text,'lxml')
rows = [i.get_text("|").split("|") for i in soup.select('#tab-1 .corridor-row')]
for row in rows:
    #a, b, c, d, e = row[2], row[15], row[18], row[21], row[25]
    #print(a, b, c, d, e, sep='|')
    print('{0[2]}|{0[15]}|{0[18]}|{0[21]}|{0[25]}'.format(row))
Output:
Citibank|0.00|1.53|1.53|3.06
Transferwise|5.05|-0.04|2.48|4.96
Western Union|5.00|1.19|3.69|7.38
MoneyGram|8.00|1.06|5.06|10.12
WorldRemit|7.99|1.30|5.30|10.60
Ria|10.00|0.84|5.84|11.68
Ceylon Exchange|10.00|1.37|6.37|12.74
Western Union|9.95|1.69|6.66|13.32
Orbit Remit|13.00|0.78|7.28|14.56
Money2anywhere|12.00|1.71|7.71|15.42
SUPAY|18.00|-1.24|7.76|15.52
Money Chain Foreign Exchange|18.00|-1.12|7.88|15.76
MoneyGram|15.00|1.30|8.80|17.60
Commonwealth Bank|22.00|3.43|14.43|28.86
Bank of China|28.00|1.50|15.50|31.00
ANZ Bank|24.00|4.51|16.51|33.02
National Australia Bank (NAB)|22.00|5.74|16.74|33.48
Bank of China|32.00|1.50|17.50|35.00
Commonwealth Bank|30.00|3.43|18.43|36.86
ANZ Bank|32.00|4.51|20.51|41.02
National Australia Bank (NAB)|30.00|5.74|20.74|41.48
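If you then want these rows as a table instead of pipe-delimited prints, a small follow-on sketch (the column names are my own reading of the site's fields, not confirmed):
import pandas as pd
# pick the same five fields used in the print above
table = [[row[2], row[15], row[18], row[21], row[25]] for row in rows]
df = pd.DataFrame(table, columns=["Firm", "Fee", "Exchange rate margin (%)", "Total cost (%)", "Total cost (AUD)"])
df.to_csv("remittance_australia_china.csv", index=False)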
