Goal: I'm trying to scrape prices
Expected Output: 2 columns 1)productName (OK) 2)price (Not OK, I have NaN)
I tried the following:
# Scrape the product name and price from a Proximus product page.
#
# NOTE(review): defects fixed relative to the original snippet:
#   * urlopen() was called without being imported;
#   * an undefined variable `page_list` was appended to the URL;
#   * a dead urllib3 request (req = urllib3 / res = req.request) and a
#     duplicate page fetch were removed;
#   * DataFrame.append() (removed in pandas 2.0) replaced by pd.concat.
# The price sits in <span class="rs-unit">, which appears to be filled
# in by JavaScript — if so, the static HTML carries no price and the
# column comes out NaN; the <meta>-tag approach further below is the
# more reliable source.
import time
from urllib.request import urlopen

import urllib3
import pandas as pd
from bs4 import BeautifulSoup

urllib3.disable_warnings()
t0 = time.time()

url = "https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html"
html = urlopen(url).read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

scrap_name = bs.find_all(["h1"])                          # product name
product_name = pd.DataFrame(scrap_name, columns=["Item_name"])
scrap_price = bs.find_all("span", {"class": "rs-unit"})   # price (may be absent in static HTML)
product_price = pd.DataFrame(scrap_price, columns=["Item_price"])

# Side-by-side name/price columns; lengths may differ, pandas pads with NaN.
scrap_list = pd.concat([product_name["Item_name"], product_price["Item_price"]], axis=1)

t1 = time.time()
print(t1 - t0)
print(scrap_list)
The data is within the <meta> tags.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

t0 = time.time()

# Fetch the product page once; name and price are duplicated inside
# <meta> tags, which are present in the static HTML (unlike the
# JavaScript-rendered price span).
page_proximus = requests.get("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus.text, 'html.parser')

rows = []
metaData = soup.find_all('meta', {'property': 'og:description'})
for meta in metaData:
    # NOTE(review): assumes <meta name="device_model"/"device_price">
    # tags are reachable as descendants of the og:description element —
    # verify against the live page markup.
    rows.append({
        'Item_name': meta.find('meta', {'name': 'device_model'})['content'],
        'Item_price': meta.find('meta', {'name': 'device_price'})['content'],
    })

t1 = time.time()
print(t1 - t0)

df = pd.DataFrame(rows)
print(df)
Output:
Item_name Item_price
0 iPhone 13 256GB Pink 1029,99
Related
Hey, this is the code I used to scrape some data from a website for practice. Can you help me put it into a data frame and save it?
# Scrape title, project details, video link, info-page link and
# description from an aedownload.com post.
# NOTE(review): the snippet used requests and BeautifulSoup without
# importing them; imports added so it runs stand-alone.
import requests
from bs4 import BeautifulSoup

url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

title = soup.find(class_="blog-title").text.strip()
project_details = soup.find(class_="project-details").text
link_wp = soup.find(class_="wp-video-shortcode").text
link_infopage = soup.find(class_="infopage112").text
# Class name spelled as it appears on the site (their typo, keep it).
project_description = soup.find(class_="Project-discription").text

print(title)
print(project_details)
print(link_wp)
print(link_infopage)
print(project_description)
Create an empty dictionary, add the scraped items to dict1, and use pandas to create the DataFrame.
# Collect each scraped field into a dict, then build a one-row DataFrame.
# NOTE(review): DataFrame.append() was deprecated in pandas 1.4 and
# removed in 2.0 — construct the frame from a list of dicts instead.
# Relies on `soup` being defined by the scraping code above.
import pandas as pd

dict1 = {}
dict1['title'] = soup.find(class_="blog-title").text.strip()
dict1['project_details'] = soup.find(class_="project-details").text
dict1['link_wp'] = soup.find(class_="wp-video-shortcode").text
dict1['link_infopage'] = soup.find(class_="infopage112").text
dict1['project_description'] = soup.find(class_="Project-discription").text

df = pd.DataFrame([dict1])
Output:
title project_details link_wp link_infopage project_description
0 Download Magazine Promo for Element 3D – FREE ... \nMagazine Promo for Element 3D 23030644 Video... https://previews.customer.envatousercontent.co... Buy it \nFree Download\n\n\n\n\n\n\nRelated Templates...
To create new DataFrame from the data you can try:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Download the post and parse it once.
url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

# Pull each field by its CSS class ("Project-discription" is the site's
# own spelling).
fields = {
    "title": soup.find(class_="blog-title").text.strip(),
    "project_details": soup.find(class_="project-details").text,
    "link_wp": soup.find(class_="wp-video-shortcode").text,
    "link_infopage": soup.find(class_="infopage112").text,
    "project_description": soup.find(class_="Project-discription").text,
}

# One-row frame: wrap every value in a list so pandas treats it as a column.
df = pd.DataFrame({name: [value] for name, value in fields.items()})
df.to_csv("data.csv", index=False)
Saves data.csv (screenshot from LibreOffice):
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape company / report title / reporter / link / date from h1.nobbd.de.
# NOTE(review): the original rebuilt `df` from a fresh single-row dict on
# every innermost iteration, so only the last report survived. Collect
# every row in a list and build the DataFrame once at the end.
URL = 'http://h1.nobbd.de/index.php?start='
rows = []
for page in range(1, 10):
    req = requests.get(URL + str(page) + '=')
    soup = BeautifulSoup(req.text, 'html.parser')
    h1 = soup.find_all('div', attrs={'class', 'report-wrapper'})
    for hack in h1:
        h2 = hack.find_all("div", attrs={"class", "report"})
        for i in h2:
            layanan = i.find_all('b')[0].text.strip()
            report = i.find_all('a')[2].text.strip()
            bug_hunter = i.find_all('a')[1].text.strip()
            mirror = i.find("a", {"class": "title"})['href']
            # Take the text of the last date div (original loop semantics).
            waktu = ''
            for d in i.find_all("div", {"class": "date"}):
                waktu = d.text
            rows.append({"Company": layanan, "Title:": report,
                         "Submit:": bug_hunter, "Link:": mirror, "Date:": waktu})

df = pd.DataFrame(rows)
My result only contains one row of data. Can you help me get all of the data and save it to a file?
df.head()
index
Company
Title:
Submit:
Link:
Date:
0
Reddit
Application level DOS at Login Page ( Accepts Long Password )
e100_speaks
https://hackerone.com/reports/1168804
03 Feb 2022
What happens?
Based on your question's code, you overwrite your dataframe on every iteration — that's why you only get one result.
How to fix?
Create an empty list before your loops
Append all the extracted dicts to this list
Create your dataframe based on that list of dicts
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Build the frame from a list of dicts so every report is kept.
# NOTE(review): fixed a NameError (`hunter` -> `bug_hunter`) and stopped
# wrapping each value in a one-element list, which would otherwise put
# list objects into the DataFrame cells instead of scalars.
data = []
url = 'http://h1.nobbd.de/index.php?start='
for page in range(1, 3):
    req = requests.get(url + str(page))
    soup = BeautifulSoup(req.text, 'html.parser')
    h1 = soup.find_all('div', attrs={'class', 'report-wrapper'})
    for hack in h1:
        h2 = hack.find_all("div", attrs={"class", "report"})
        for i in h2:
            layanan = i.find_all('b')[0].text.strip()
            report = i.find_all('a')[2].text.strip()
            bug_hunter = i.find_all('a')[1].text.strip()
            mirror = i.find("a", {"class": "title"})['href']
            date = i.find_all("div", {"class": "date"})
            for d in date:
                waktu = d.text
            data.append({'Company': layanan, 'Title': report,
                         'Submit': bug_hunter, 'link': mirror, 'Date': waktu})

df = pd.DataFrame(data)
My program returns different numbers each time. If I run each page individually it gives out the right results. I wanted to get all the links which have 3 or more votes.
from bs4 import BeautifulSoup as bs
import requests
import pandas

# Discover how many result pages exist by counting pagination links on
# the first page, then collect a link to every question with > 2 votes.
pg = 1
base = "https://stackoverflow.com/search?page={}&tab=Relevance&q=scrappy%20python"

first_page = bs(requests.get(base.format(pg)).text, 'html.parser')
number_of_pages = len(first_page.findAll('a', {'class': 's-pagination--item js-pagination-item'}))
print(number_of_pages)

qualified = []
while pg <= number_of_pages:
    print("In Page :" + str(pg))
    page_soup = bs(requests.get(base.format(pg)).text, 'html.parser')
    # Question links and their vote counts appear in matching order.
    hrefs = [a.get('href') for a in page_soup.findAll('a', {'class': 'question-hyperlink'})]
    vote_spans = page_soup.findAll('span', {'class': 'vote-count-post'})
    for idx, span in enumerate(vote_spans):
        if int(span.strong.text) > 2:
            qualified.append('https://stackoverflow.com' + hrefs[idx])
            print(len(qualified))
    pg += 1
print(len(qualified)) shows the length of the full accumulated list — that is your error. To count how many links each page contributes, add i = 0 after while pg<=number_of_pages:, add i += 1 after if n>2:, and then print(i) before or after pg += 1.
Then code will be like this:
from bs4 import BeautifulSoup as bs
import requests
import pandas

# Same crawl as above, but with a per-page counter `i` so every page
# reports how many of its links qualified (vote count > 2).
pg = 1
search_url = "https://stackoverflow.com/search?page={}&tab=Relevance&q=scrappy%20python"

soup = bs(requests.get(search_url.format(pg)).text, 'html.parser')
number_of_pages = len(soup.findAll('a', {'class': 's-pagination--item js-pagination-item'}))
print(number_of_pages)

qualified = []
while pg <= number_of_pages:
    i = 0  # qualifying links found on this page only
    print("In Page :" + str(pg))
    soup = bs(requests.get(search_url.format(pg)).text, 'html.parser')
    hrefs = [a.get('href') for a in soup.findAll('a', {'class': 'question-hyperlink'})]
    vote_spans = soup.findAll('span', {'class': 'vote-count-post'})
    for idx, span in enumerate(vote_spans):
        if int(span.strong.text) > 2:
            i += 1
            qualified.append('https://stackoverflow.com' + hrefs[idx])
    print(i)
    pg += 1
#print(qualified)
Output:
6
In Page :1
1
In Page :2
4
In Page :3
2
In Page :4
3
In Page :5
2
In Page :6
2
How do I scrape multiple pages into Excel?
for example i want to scrape "http://econpy.pythonanywhere.com/ex/001.html"
how to scrape next pages considering number of pages are unknown
Also, I have written some code, but it writes NoneType into the file instead of the data.
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Scrape buyer name and item price from one econpy example page into a CSV.
# NOTE(review): the original stored the result of print() — always None —
# and wrote that to the file (hence "NoneType" in the output); it also
# wrote both fields with no separator and no newline. Write the tag text
# directly, comma-separated, one row per buyer.
page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"

with open(new_file, "w") as f:
    f.write("Header1, Header2\n")
    html = urlopen(page_url)
    soup = BeautifulSoup(html, "html.parser")
    for i in soup.find_all("div", {"title": "buyer-info"}):
        name = i.find("div", {"title": "buyer-name"}).get_text()
        price = i.find("span", {"class": "item-price"}).get_text()
        print(name)
        print(price)
        f.write("{},{}\n".format(name, price))
What am I doing wrong?
Give this a try and let me know if you have any issues. I used "css selector" and "requests" for the operation to be accomplished.
import csv
import requests
from bs4 import BeautifulSoup

# Walk pages 001.html .. 005.html, pulling buyer name and price via CSS
# selectors, and stream the rows into Mynew.csv as they are found.
with open('Mynew.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Price"])
    for page in range(1, 6):
        response = requests.get("http://econpy.pythonanywhere.com/ex/00{0}.html".format(page))
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.select("div[title=buyer-info]"):
            name = item.select_one("div[title=buyer-name]").get_text()
            price = item.select_one("span.item-price").get_text()
            writer.writerow([name, price])
            print(name, price)
i got it solved till 1st page... and this is the code
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Single-page version: write buyer name/price pairs to Mynew.csv.
page_url = "http://econpy.pythonanywhere.com/ex/001.html"
with open("Mynew.csv", "w") as f:
    f.write("Header1,Header2\n")
    soup = BeautifulSoup(urlopen(page_url), "html.parser")
    for info in soup.find_all("div", {"title": "buyer-info"}):
        name_tag = info.find("div", {"title": "buyer-name"})
        price_tag = info.find("span", {"class": "item-price"})
        f.write('{},{}\n'.format(name_tag.text, price_tag.text))
Now comes the hard part: how do I scrape multiple pages, i.e. the following pages as well?
i have written a code that scrapes the websites: https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=
{}&PageSize=36&order=BESTMATCH".format(page)
but when I run this code the data is not formatted correctly: the product name, price and image end up spread across the wrong cells.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Scrape item name / price / image URL from Newegg search-result pages.
# NOTE(review): the original split the URL string literal across three
# source lines, which is a SyntaxError; it also wrote the raw product
# name into the CSV, so any comma in the name shifted the columns. The
# URL is now one (implicitly concatenated) string and commas in the
# name are replaced with '|'.
with open("Scrapedetails.csv", "w") as f:
    f.write("Item_Name, Price, Image\n")
    for page in range(1, 15):
        page_url = ("https://www.newegg.com/Product/ProductList.aspx?"
                    "Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page="
                    "{}&PageSize=36&order=BESTMATCH".format(page))
        bs0bj = BeautifulSoup(urlopen(page_url), "html.parser")
        for i in bs0bj.find_all("div", {"class": "item-container"}):
            item_name = i.find("a", {"class": "item-title"}).get_text().strip()
            price = i.find("li", {"class": "price-current"}).get_text().strip()
            image = i.find("img")["src"]  # src attribute holds the image URL
            f.write("{},{},{}\n".format(item_name.replace(",", "|"), price, image))
Can someone help me amend the code so that the name goes in the name column, the price in the price column, and the image in the image column?
what are the new ways to save data in csv,can someone help me in it with codes too?
Alright i got it solved.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Final working version: item name, price (from the <strong> inside
# li.price-current) and image URL for 14 Newegg result pages.
# NOTE(review): the URL literal was broken across three source lines —
# a SyntaxError — and is now one implicitly concatenated string.
f = open("Scrapedetails.csv", "w")
Headers = "Item_Name, Price, Image\n"
f.write(Headers)
for page in range(1, 15):
    page_url = ("https://www.newegg.com/Product/ProductList.aspx?"
                "Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page="
                "{}&PageSize=36&order=BESTMATCH".format(page))
    html = urlopen(page_url)
    bs0bj = BeautifulSoup(html, "html.parser")
    page_details = bs0bj.find_all("div", {"class": "item-container"})
    for i in page_details:
        Item_Name = i.find("a", {"class": "item-title"})
        Price = i.find("li", {"class": "price-current"}).find('strong')
        Image = i.find("img")
        Name_item = Item_Name.get_text().strip()
        prin = Price.get_text()
        imgf = Image["src"]  # to get the key src
        print(Name_item)
        print(prin)
        print('https:{}'.format(imgf))
        # Commas inside the product name are replaced with '|' so the CSV
        # columns stay aligned.
        f.write("{}".format(Name_item).replace(",", "|") + ",{}".format(prin) + ",https:{}".format(imgf) + "\n")
f.close()
This is the code for anyone who wants to start web scraping in the simplest way.