Beautiful Soup AttributeError [closed] - web-scraping

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 2 years ago.
import requests
from bs4 import BeautifulSoup
import pandas as pd

for n in range(1, 16):
    response = requests.get(
        'https://www.flipkart.com/search?q=books&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as'
        '=off '
        '&page=' + str(n))
    soup = BeautifulSoup(response.text, 'html.parser')
    # print(soup.prettify())

    urls = list()
    for a in soup.find_all('a', {'class': '_2cLu-l'}):
        urls.append('https://www.flipkart.com' + a['href'])
    # for a in soup.find_all('a', {'class': '_31qSD5'}):
    #     urls.append('https://www.flipkart.com' + a['href'])
    #
    # for a in soup.find_all('a', {'class': '_3dqZjq'}):
    #     urls.append('https://www.flipkart.com' + a['href'])

    products = list()
    for url in urls:
        product = dict()
        page_soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        name = page_soup.find('h1', {'class': '_9E25nV'})
        product['name'] = name.text

        price = page_soup.find('div', {'class': '_1vC4OE _3qQ9m1'})
        product['price'] = price.text

        ratingsAndReviews = page_soup.find('span', {'class': '_38sUEc'})
        if ratingsAndReviews is None:
            product['ratingsAndReviews'] = '0 ratings & 0 reviews'
        else:
            product['ratingsAndReviews'] = ratingsAndReviews.text

        products.append(product)

    df = pd.DataFrame(products)
    print(df)
    df.to_csv(r'C:\Users\shiva\Desktop\Damn\Output_flipkart.csv', index=True)
The failing line and the error:

product['name'] = name.text
AttributeError: 'NoneType' object has no attribute 'text'
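The AttributeError means that page_soup.find('h1', {'class': '_9E25nV'}) returned None for that page (the selector matched nothing), so there is no .text to read. One tolerant pattern is a small helper like the sketch below; text_or_default is an illustrative name, not something from the question or the answer, and it is an alternative to the try/except approach shown further down:

def text_or_default(tag, default=''):
    # return the element's text if find() matched something, otherwise a default value
    return tag.text if tag is not None else default

# hypothetical usage inside the product loop:
# product['name'] = text_or_default(page_soup.find('h1', {'class': '_9E25nV'}))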

Sometimes the website returns unexpected HTML in order to block you, so wrap the body of the inner for loop in a try/except block. That way, even if some URLs don't work, the program won't stop.
Also move the code that builds the DataFrame and saves the CSV out of the for loops.
import requests
from bs4 import BeautifulSoup
import pandas as pd

products = list()
for n in range(1, 16):
    response = requests.get(
        'https://www.flipkart.com/search?q=books&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as'
        '=off '
        '&page=' + str(n))
    soup = BeautifulSoup(response.text, 'html.parser')

    urls = list()
    for a in soup.find_all('a', {'class': '_2cLu-l'}):
        urls.append('https://www.flipkart.com' + a['href'])

    for url in urls:
        try:
            product = dict()
            res = requests.get(url)
            page_soup = BeautifulSoup(res.text, 'html.parser')

            name = page_soup.find('h1', {'class': '_9E25nV'})
            product['name'] = name.text

            price = page_soup.find('div', {'class': '_1vC4OE _3qQ9m1'})
            product['price'] = price.text

            ratingsAndReviews = page_soup.find('span', {'class': '_38sUEc'})
            if ratingsAndReviews is None:
                product['ratingsAndReviews'] = '0 ratings & 0 reviews'
            else:
                product['ratingsAndReviews'] = ratingsAndReviews.text

            products.append(product)
        except Exception as e:
            print(e)

df = pd.DataFrame(products)
df.to_csv("data.csv", index=False)

Related

How can I save this scraped data into a data frame?

Hey, this is the code I used to scrape some data from a website for practice. Can you help me put it into a data frame and save it?
url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(class_="blog-title").text.strip()
project_details = soup.find( class_="project-details").text
link_wp = soup.find (class_="wp-video-shortcode").text
link_infopage = soup.find(class_="infopage112").text
project_description = soup.find(class_= "Project-discription").text
print(title)
print(project_details)
print(link_wp)
print(link_infopage)
print(project_description)
Create an empty dictionary, add the scraped items to dict1, and use pandas to build the DataFrame:
import pandas as pd

dict1 = {}
dict1['title'] = soup.find(class_="blog-title").text.strip()
dict1['project_details'] = soup.find(class_="project-details").text
dict1['link_wp'] = soup.find(class_="wp-video-shortcode").text
dict1['link_infopage'] = soup.find(class_="infopage112").text
dict1['project_description'] = soup.find(class_="Project-discription").text

df = pd.DataFrame()
df = df.append(dict1, ignore_index=True)
Output:
title project_details link_wp link_infopage project_description
0 Download Magazine Promo for Element 3D – FREE ... \nMagazine Promo for Element 3D 23030644 Video... https://previews.customer.envatousercontent.co... Buy it \nFree Download\n\n\n\n\n\n\nRelated Templates...
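Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the last line of that answer fails on current pandas. A sketch of the equivalent using the constructor:

import pandas as pd

# build the one-row frame directly from a single-element list of dicts
df = pd.DataFrame([dict1])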
To create a new DataFrame from the data, you can try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://aedownload.com/download-magazine-promo-for-element-3d-free-videohive/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(class_="blog-title").text.strip()
project_details = soup.find(class_="project-details").text
link_wp = soup.find(class_="wp-video-shortcode").text
link_infopage = soup.find(class_="infopage112").text
project_description = soup.find(class_="Project-discription").text
df = pd.DataFrame(
    {
        "title": [title],
        "project_details": [project_details],
        "link_wp": [link_wp],
        "link_infopage": [link_infopage],
        "project_description": [project_description],
    }
)

df.to_csv("data.csv", index=False)
This saves data.csv (screenshot from LibreOffice not shown).

My result only gets 1 row of data. Can you help me get more data and save it to another file?

import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'http://h1.nobbd.de/index.php?start='
for page in range(1,10):
    req = requests.get(URL + str(page) + '=')
    soup = BeautifulSoup(req.text, 'html.parser')
    h1 = soup.find_all('div', attrs={'class','report-wrapper'})
    for hack in h1:
        h2 = hack.find_all("div", attrs={"class","report"})
        for i in h2:
            layanan = i.find_all('b')[0].text.strip()
            report = i.find_all('a')[2].text.strip()
            bug_hunter = i.find_all('a')[1].text.strip()
            mirror = i.find("a", {"class": "title"})['href']
            date = i.find_all("div", {"class": "date"})
            for d in date:
                waktu = d.text
            data = {"Company": [layanan], "Title:": [report], "Submit:": [bug_hunter], "Link:": [mirror], "Date:": [waktu]}
            df = pd.DataFrame(data)
df.head() shows only one row:

index  Company  Title:                                                          Submit:      Link:                                   Date:
0      Reddit   Application level DOS at Login Page ( Accepts Long Password )  e100_speaks  https://hackerone.com/reports/1168804  03 Feb 2022
What happens?
Based on your question's code, you overwrite your DataFrame on every iteration; that's why you only get one result.
How to fix?
Create an empty list before your loops.
Append every extracted dict to that list.
Create your DataFrame from that list of dicts.
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []

url = 'http://h1.nobbd.de/index.php?start='
for page in range(1,3):
    req = requests.get(url + str(page))
    soup = BeautifulSoup(req.text, 'html.parser')
    h1 = soup.find_all('div', attrs={'class','report-wrapper'})
    for hack in h1:
        h2 = hack.find_all("div", attrs={"class","report"})
        for i in h2:
            layanan = i.find_all('b')[0].text.strip()
            report = i.find_all('a')[2].text.strip()
            bug_hunter = i.find_all('a')[1].text.strip()
            mirror = i.find("a", {"class": "title"})['href']
            date = i.find_all("div", {"class": "date"})
            for d in date:
                waktu = d.text
            # append one plain dict per report; the DataFrame is built once, after the loops
            data.append({'Company': layanan, 'Title': report, 'Submit': bug_hunter, 'link': mirror, 'Date': waktu})

df = pd.DataFrame(data)
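The question also asks about saving the result to another file; writing the DataFrame once, after the loops, would do that (the filename below is only an example):

# persist the collected reports; any path/filename can be used here
df.to_csv("reports.csv", index=False)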

Having a problem with web scraping with bs4 in Python

My program returns different numbers each time, but if I run each page individually it gives the right results. I want to get all the links which have 3 or more votes.
from bs4 import BeautifulSoup as bs
import requests
import pandas

pg = 1
url = "https://stackoverflow.com/search?page="+str(pg)+"&tab=Relevance&q=scrappy%20python"
src = requests.get(url).text
soup = bs(src, 'html.parser')
pages = soup.findAll('a', {'class': 's-pagination--item js-pagination-item'})
number_of_pages = len(pages)
print(number_of_pages)

qualified = []
while pg <= number_of_pages:
    print("In Page :"+str(pg))
    url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    hrefs = []
    for a_link in a_links:
        hrefs.append(a_link.get('href'))
    for link in range(len(span_links)):
        vote = span_links[link].strong.text
        n = int(vote)
        if n > 2:
            the_link = 'https://stackoverflow.com' + hrefs[link]
            qualified.append(the_link)
    print(len(qualified))
    pg += 1
print(len(qualified)) shows the length of the full, cumulative list; that is your error. To count how many links each page contributes, add i = 0 right after while pg <= number_of_pages:, add i += 1 inside if n > 2:, and then print(i) before or after pg += 1.
Then the code will look like this:
from bs4 import BeautifulSoup as bs
import requests
import pandas

pg = 1
url = "https://stackoverflow.com/search?page="+str(pg)+"&tab=Relevance&q=scrappy%20python"
src = requests.get(url).text
soup = bs(src, 'html.parser')
pages = soup.findAll('a', {'class': 's-pagination--item js-pagination-item'})
number_of_pages = len(pages)
print(number_of_pages)

qualified = []
while pg <= number_of_pages:
    i = 0
    print("In Page :"+str(pg))
    url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    hrefs = []
    for a_link in a_links:
        hrefs.append(a_link.get('href'))
    for link in range(len(span_links)):
        vote = span_links[link].strong.text
        n = int(vote)
        if n > 2:
            i += 1
            the_link = 'https://stackoverflow.com' + hrefs[link]
            qualified.append(the_link)
    print(i)
    pg += 1
# print(qualified)
Output:
6
In Page :1
1
In Page :2
4
In Page :3
2
In Page :4
3
In Page :5
2
In Page :6
2

How to extract title of each product using python web scraping

Here is the link: https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0
def get_index_data(soup):
    try:
        links = soup.find_all('div', 'a', id=False).get('href')
    except:
        links = []
    print(links)
Find all the div elements with class name Name (class="Name"); that gives you all the title names. If you want the href, iterate over the titles and find the a tag whose title attribute equals title.text.
import requests
import bs4 as bs

url = 'https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0'
response = requests.get(url)
# print('Response:', response.status_code)
soup = bs.BeautifulSoup(response.text, 'lxml')
titles = soup.find_all('div', {'class': 'Name'})
# a = soup.find_all('a')
# print(a)

for title in titles:
    link = soup.find('a', {'title': title.text}).get('href')
    print('https://www.118100.se' + link)

I'm getting json.decoder.JSONDecodeError while running a Python script

I got this code from the internet for extracting data from the Justdial website.
While running it, I get the following error:
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Please help me run this code, as I'm not familiar with Python. What changes need to be made to get it running?
Thank you in advance.
Here is my code:
import csv
import json
import requests
from bs4 import BeautifulSoup

print(25*"=")
print("Just Dial Scraper")
print(25*"=")

url = 'http://www.justdial.com/functions/ajxsearch.php?national_search=0&act'\
      '=pagination&city={0}&search={1}&page={2}'

what = input("Enter your Query: ")
what = what.replace(' ', '+')
where = input("Enter the Location: ")

with open(what+"_"+where+'.csv', 'w') as f:
    f.write('company, address, phone\n')
    page = 1
    while True:
        print('Scraping Page', page)
        resp = requests.get(url.format(where, what, page))
        if not resp.json()['paidDocIds']:
            print(25*"-")
            print('Scraping Finished')
            print(25*"-")
            break

        markup = resp.json()['markup'].replace('\/', '/')
        soup = BeautifulSoup(markup, 'html.parser')

        for thing in soup.find_all('section'):
            csv_list = []
            if thing.get('class') == ['jcar']:
                # Company name
                for a_tag in thing.find_all('a'):
                    if a_tag.get('onclick') == "_ct('clntnm', 'lspg');":
                        csv_list.append(a_tag.get('title'))
                # Address
                for span_tag in thing.find_all('span'):
                    if span_tag.get('class') == ['mrehover', 'dn']:
                        csv_list.append(span_tag.get_text().strip())
                # Phone number
                for a_tag in thing.find_all('a'):
                    if a_tag.get('href').startswith('tel:'):
                        csv_list.append(a_tag.get('href').split(':')[-1])

                csv_list = ['"'+item+'"' for item in csv_list]
                writeline = ','.join(csv_list)+'\n'
                f.write(','.join(csv_list)+'\n')

        page += 1
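The error itself means resp.json() could not parse the response body as JSON: the Justdial endpoint most likely returned an HTML block page or an empty body instead of the expected JSON. A minimal way to see what is actually coming back before parsing, sketched as a replacement for the resp = requests.get(...) line inside the while loop (the User-Agent value and the handling are assumptions, not a confirmed fix from this thread):

resp = requests.get(url.format(where, what, page),
                    headers={'User-Agent': 'Mozilla/5.0'})  # example header; the site may still block requests
if not resp.text.strip().startswith('{'):
    # the body is not JSON: inspect what the server actually sent, then stop the loop
    print(resp.status_code, resp.text[:200])
    break
data = resp.json()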
