How Do I scrape Multiple pages and write data into excel - web-scraping

How to sprape multiple pages in excel.
for example i want to scrape "http://econpy.pythonanywhere.com/ex/001.html"
how to scrape next pages considering number of pages are unknown
Plus i have written a code it prints nonetype in excel but not the data
from bs4 import BeautifulSoup
from urllib.request import urlopen
page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1, Header2\n"
f.write(Headers)
html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title":"buyer-info"})
for i in buyer_info:
Header1 = i.find("div", {"title":"buyer-name"})
Header2 = i.find("span", {"class":"item-price"})
salmon = print(Header1.get_text())
salam = print(Header2.get_text())
f.write("{}".format(salmon)+ "{}".format(salam))
f.close()
What i am doing wrong?

Give this a try and let me know if you have any issues. I used "css selector" and "requests" for the operation to be accomplished.
import csv ; import requests
from bs4 import BeautifulSoup
outfile = open('Mynew.csv', 'w',newline='')
writer = csv.writer(outfile)
writer.writerow(["Name","Price"])
for page in range(1,6):
html = requests.get("http://econpy.pythonanywhere.com/ex/00{0}.html".format(page))
soup = BeautifulSoup(html.text, "html.parser")
for item in soup.select("div[title=buyer-info]"):
Header1 = item.select_one("div[title=buyer-name]").get_text()
Header2 = item.select_one("span.item-price").get_text()
writer.writerow([Header1, Header2])
print(Header1,Header2)
outfile.close()

i got it solved till 1st page... and this is the code
from bs4 import BeautifulSoup
from urllib.request import urlopen
page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1,Header2\n"
f.write(Headers)
html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title":"buyer-info"})
for i in buyer_info:
Header1 = i.find("div", {"title":"buyer-name"})
Header2 = i.find("span", {"class":"item-price"})
f.write('{},{}\n'.format(Header1.text, Header2.text))
f.close()
now the pain comes how to spacre for mulitple pages , means how to scrape next pages also?

Related

How can I save this scrapped data into a data frame?

Hey this is my code that i used to scrap some data from website for practise. Can you help me set it into a data frame and save it?
url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(class_="blog-title").text.strip()
project_details = soup.find( class_="project-details").text
link_wp = soup.find (class_="wp-video-shortcode").text
link_infopage = soup.find(class_="infopage112").text
project_description = soup.find(class_= "Project-discription").text
print(title)
print(project_details)
print(link_wp)
print(link_infopage)
print(project_description)
Create an empty dictionary and append items to dict1 and use pandas to create dataframe
dict1={}
dict1['title'] = soup.find(class_="blog-title").text.strip()
dict1['project_details'] = soup.find( class_="project-details").text
dict1['link_wp'] = soup.find (class_="wp-video-shortcode").text
dict1['link_infopage'] = soup.find(class_="infopage112").text
dict1['project_description'] = soup.find(class_= "Project-discription").text
import pandas as pd
df = pd.DataFrame()
df = df.append(dict1, ignore_index=True)
Output:
title project_details link_wp link_infopage project_description
0 Download Magazine Promo for Element 3D – FREE ... \nMagazine Promo for Element 3D 23030644 Video... https://previews.customer.envatousercontent.co... Buy it \nFree Download\n\n\n\n\n\n\nRelated Templates...
To create new DataFrame from the data you can try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(class_="blog-title").text.strip()
project_details = soup.find(class_="project-details").text
link_wp = soup.find(class_="wp-video-shortcode").text
link_infopage = soup.find(class_="infopage112").text
project_description = soup.find(class_="Project-discription").text
df = pd.DataFrame(
{
"title": [title],
"project_details": [project_details],
"link_wp": [link_wp],
"link_infopage": [link_infopage],
"project_description": [project_description],
}
)
df.to_csv("data.csv", index=False)
Saves data.csv (screenshot from LibreOffice):

Prices webscraping using BeautifulSoup

Goal: I'm trying to scrape prices
Expected Output: 2 columns 1)productName (OK) 2)price (Not OK, I have NaN)
I tried the following:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import time
urllib3.disable_warnings()
t0 = time.time()
page_proximus = urlopen("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus, 'html.parser')
scrap_list=pd.DataFrame(columns =['Item_name','Item_price'])
url = 'https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html'+ str(page_list)
req = urllib3
res = req.request
soup = BeautifulSoup(page_proximus, 'html.parser')
html = urlopen('https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html').read().decode("utf-8")
bs = BeautifulSoup(html, 'html.parser')
scrap_name = bs.find_all(["h1"])
product_name=pd.DataFrame(scrap_name,columns =['Item_name'])
scrap_price = bs.find_all ("span",{'class': 'rs-unit'})
product_price=pd.DataFrame(scrap_price,columns =['Item_price'])
scrap_list=scrap_list.append(pd.concat([product_name['Item_name'], product_price['Item_price']],
axis=1))
t1 = time.time()
r=t1-t0
print(r)
print(scrap_list)
The data is within the <meta> tags.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
t0 = time.time()
page_proximus = requests.get("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus.text, 'html.parser')
rows = []
metaData = soup.find_all('meta',{'property':'og:description'})
for meta in metaData:
row = {'Item_name':meta.find('meta',{'name':'device_model'})['content'],
'Item_price':meta.find('meta',{'name':'device_price'})['content']}
rows.append(row)
t1 = time.time()
r=t1-t0
print(r)
df = pd.DataFrame(rows)
print(df)
Output:
Item_name Item_price
0 iPhone 13 256GB Pink 1029,99

my result only get 1 data, can you help me for get more data and save another file?

import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'http://h1.nobbd.de/index.php?start='
for page in range(1,10):
req = requests.get(URL + str(page) + '=')
soup = BeautifulSoup(req.text, 'html.parser')
h1 = soup.find_all('div',attrs={'class','report-wrapper'})
for hack in h1:
h2 = hack.find_all("div",attrs={"class","report"})
for i in h2:
layanan = i.find_all('b')[0].text.strip()
report = i.find_all('a')[2].text.strip()
bug_hunter = i.find_all('a')[1].text.strip()
mirror = i.find("a", {"class": "title"})['href']
date = i.find_all("div", {"class": "date"})
for d in date:
waktu = d.text
data = {"Company": [layanan], "Title:": [report], "Submit:": [bug_hunter], "Link:": [mirror], "Date:": [waktu]}
df = pd.DataFrame(data)
my result only get 1 data, can you help me for get more data and save another file?
df.head()
index
Company
Title:
Submit:
Link:
Date:
0
Reddit
Application level DOS at Login Page ( Accepts Long Password )
e100_speaks
https://hackerone.com/reports/1168804
03 Feb 2022
What happens?
Based on your questions code, you will overwrite your dataframe with every iteration, thats why you only get one result.
How to fix?
Create an empty list before your loops
Append all the extracted dicts to this list
Create your dataframe based on that list of dicts
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
data = []
url = 'http://h1.nobbd.de/index.php?start='
for page in range(1,3):
req = requests.get(url + str(page))
soup = BeautifulSoup(req.text, 'html.parser')
h1 = soup.find_all('div',attrs={'class','report-wrapper'})
for hack in h1:
h2 = hack.find_all("div",attrs={"class","report"})
for i in h2:
layanan = i.find_all('b')[0].text.strip()
report = i.find_all('a')[2].text.strip()
bug_hunter = i.find_all('a')[1].text.strip()
mirror = i.find("a", {"class": "title"})['href']
date = i.find_all("div", {"class": "date"})
for d in date:
waktu = d.text
data.append({'Company':[layanan], 'Title':[report], 'Submit':[hunter], 'link':[mirror], 'Date':[waktu]})
df = pd.DataFrame(data)

How to extract title of each product using python web scraping

Here is the link:https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0
def get_index_data(soup):
try:
links = soup.find_all('div','a',id=False).get('href')
except:
links = []
print(links)
Find all div, which has class name Name (class="Name"). which gives you all title names. If you want href then iterate through all titles and find a tag which has title is text of title.text.
import requests
import bs4 as bs
url = 'https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0'
response = requests.get(url)
# print('Response:', response.status_code)
soup = bs.BeautifulSoup(response.text, 'lxml')
titles = soup.find_all('div', {'class': 'Name'})
# a = soup.find_all('a')
# print(a)
for title in titles:
link = soup.find('a', {'title': title.text}).get('href')
print('https://www.118100.se' + link)

Unable to get data into correct format after web scraping in python

i have written a code that scrapes the websites: https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=
{}&PageSize=36&order=BESTMATCH".format(page)
but when i run this code, data is not formtted, like product name is coming in ever cell and so on price and image.
from urllib.request import urlopen
from bs4 import BeautifulSoup
f = open("Scrapedetails.csv", "w")
Headers = "Item_Name, Price, Image\n"
f.write(Headers)
for page in range(1,15):
page_url = "https://www.newegg.com/Product/ProductList.aspx?
Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=
{}&PageSize=36&order=BESTMATCH".format(page)
html = urlopen(page_url)
bs0bj = BeautifulSoup(html, "html.parser")
page_details = bs0bj.find_all("div", {"class":"item-container"})
for i in page_details:
Item_Name = i.find("a", {"class":"item-title"})
Price = i.find("li", {"class":"price-current"})
Image = i.find("img")
Name_item = Item_Name.get_text()
Prin = Price.get_text()
imgf = Image["src"]# to get the key src
f.write("{}".format(Name_item).strip()+ ",{}".format(Prin).strip()+
",{}".format(imgf)+ "\n")
f.close()
can someone help me to ammend codes so that i can get name in name column, price in price column and image in image column.
what are the new ways to save data in csv,can someone help me in it with codes too?
Alright i got it solved.
from urllib.request import urlopen
from bs4 import BeautifulSoup
f = open("Scrapedetails.csv", "w")
Headers = "Item_Name, Price, Image\n"
f.write(Headers)
for page in range(1,15):
page_url = "https://www.newegg.com/Product/ProductList.aspx?
Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=
{}&PageSize=36&order=BESTMATCH".format(page)
html = urlopen(page_url)
bs0bj = BeautifulSoup(html, "html.parser")
page_details = bs0bj.find_all("div", {"class":"item-container"})
for i in page_details:
Item_Name = i.find("a", {"class":"item-title"})
Price = i.find("li", {"class":"price-current"}).find('strong')
Image = i.find("img")
Name_item = Item_Name.get_text().strip()
prin = Price.get_text()
imgf = Image["src"]# to get the key src
print(Name_item)
print(prin)
print('https:{}'.format(imgf))
f.write("{}".format(Name_item).replace(",", "|")+ ",{}".format(prin)+ ",https:{}".format(imgf)+ "\n")
f.close()
These are the codes for anyone who wishes to start with webscraping a simplest way

Resources