I am using the code below to scrape a specific page:
from bs4 import BeautifulSoup
import requests
url = "https://www.mychoize.com/self-drive-car-rentals-pune/cars"
page = requests.get(url)
print(page.history)
for resp in page.history:
    print(resp.status_code, resp.url)

soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_="product-box")
for item in lists:
    title = item.find('h3', class_="margin-o ng-binding")
    # print(title)
But it keeps scraping the homepage (https://www.mychoize.com) instead. To stop it from redirecting to the homepage, I tried the following code to explore the response history:
from bs4 import BeautifulSoup
import requests
url = "https://www.mychoize.com/self-drive-car-rentals-pune/cars"
page = requests.get(url, allow_redirects=True)
print(page.history)
for resp in page.history:
    print(resp.status_code, resp.url)

soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_="product-box")
for item in lists:
    title = item.find('h3', class_="margin-o ng-binding")
    # print(title)
I obtained the following output
[<Response [302]>, <Response [301]>]
302 https://www.mychoize.com/self-drive-car-rentals-pune/cars
301 http://www.mychoize.com/
How do I prevent it from redirecting?
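On the requests side you can pass allow_redirects=False, which stops the library from following the 302 so you can inspect where the server wants to send you; note this only prevents the follow-up request, it does not make the server return the car list. A minimal sketch (the browser-style User-Agent header is an assumption on my part and may or may not change what the server does):

import requests

url = "https://www.mychoize.com/self-drive-car-rentals-pune/cars"

# Do not follow the redirect; status_code will be 302 and the Location
# header shows where the server tried to send us.
page = requests.get(url, allow_redirects=False)
print(page.status_code, page.headers.get("Location"))

# Assumption: some sites only serve the real page to browser-like clients,
# so retrying with a User-Agent header is worth a try.
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(page.url)  # check whether we still end up on the homepage

If the car listing itself is rendered client-side by JavaScript, requests alone will not see it even without the redirect; in that case look for the XHR endpoint in the browser's network tab, as the drought-monitor answer further down does.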
Related
I'm new to web scraping and I was trying to scrape the FUTBIN (FUT 22) player database at https://www.futbin.com/players. My code is below; I don't know why it can't get any results from the FUTBIN page, even though it was successful on other webpages like IMDB.
Code:
import requests
from bs4 import BeautifulSoup
request = requests.get("https://www.futbin.com/players")
src = request.content
soup = BeautifulSoup(src, features="html.parser")
results = soup.find("a", class_="player_name_players_table get-tp")
print(results)
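One first thing worth checking (an assumption on my part, since I haven't inspected FUTBIN's responses): print the status code and retry with a browser-like User-Agent, because many sites answer the default python-requests agent with an error page or empty markup, and any find() on that markup returns None.

import requests
from bs4 import BeautifulSoup

# Assumption: FUTBIN may serve different content to the default requests
# User-Agent, so send a browser-like one and check the status first.
headers = {"User-Agent": "Mozilla/5.0"}
request = requests.get("https://www.futbin.com/players", headers=headers)
print(request.status_code)  # anything other than 200 explains the empty result

soup = BeautifulSoup(request.content, features="html.parser")
# class name taken from the question, with the editor residue removed
results = soup.find("a", class_="player_name_players_table get-tp")
print(results)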
I'm attempting to scrape the data from a table on the following website: https://droughtmonitor.unl.edu/DmData/DataTables.aspx
import requests
from bs4 import BeautifulSoup
url = 'https://droughtmonitor.unl.edu/DmData/DataTables.aspx'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
drought_table = soup.find('table', {'id':'datatabl'}).find('tbody').find_all('tr')
For some reason I am getting no output. I've also tried pandas for the same job:
import pandas as pd
url = 'https://droughtmonitor.unl.edu/DmData/DataTables.aspx'
table = pd.read_html(url)
df = table[0]
But that also ended up producing an empty DataFrame.
What could be causing this?
Checking the browser's network tool shows that the site loads the table with a separate Fetch/XHR request.
Image: network monitor
You can use this code to get table data:
import requests
import json
headers = {
    'Content-Type': 'application/json; charset=utf-8',
}

params = (
    ('area', "'conus'"),
    ('statstype', "'1'"),
)

response = requests.get(
    'https://droughtmonitor.unl.edu/DmData/DataTables.aspx/ReturnTabularDMAreaPercent_national',
    headers=headers, params=params,
)

table = json.loads(response.content)
# Code generated by https://curlconverter.com/
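If you want the result as a DataFrame, here is a short follow-up sketch that continues from the table variable above. The "d" key is an assumption based on how ASP.NET page methods usually wrap their JSON, so inspect the actual payload first:

import json
import pandas as pd

rows = table.get("d", table)   # "d" is an assumption; check the real response
if isinstance(rows, str):      # some endpoints double-encode the JSON payload
    rows = json.loads(rows)
df = pd.DataFrame(rows)
print(df.head())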
I've been trying to learn bs4 for the past few days. I successfully scraped a page and printed the result to a text file, so I tried scraping multiple pages. The results print successfully in the terminal, but when I try to write them to text files, only the last file gets saved and the rest are never written. Since I'm new to coding I can't figure out the actual reason.
import bs4
import requests
from fake_useragent import UserAgent
import io
urls = ['https://en.m.wikipedia.org/wiki/Grove_(nature)','https://en.wikipedia.org/wiki/Azadirachta_indica','https://en.wikipedia.org/wiki/Olive']
user_agent = UserAgent()
for url in urls:
    page = requests.get(url, headers={"user-agent": user_agent.chrome})
    tree = bs4.BeautifulSoup(page.text, 'html.parser')
    title = tree.find('title').get_text()
    text = tree.find_all('p')[1].get_text()
    name = title + '.txt'

with io.open(name, "w", encoding="utf-8") as text_file:
    text_file.write(text)

print('files are ready')
You create the file outside the loop. Put the with statement in the for-loop like this:
import bs4
import requests
from fake_useragent import UserAgent
import io
urls = ['https://en.m.wikipedia.org/wiki/Grove_(nature)','https://en.wikipedia.org/wiki/Azadirachta_indica','https://en.wikipedia.org/wiki/Olive']
user_agent = UserAgent()
for url in urls:
    page = requests.get(url, headers={"user-agent": user_agent.chrome})
    tree = bs4.BeautifulSoup(page.text, 'html.parser')
    title = tree.find('title').get_text()
    text = tree.find_all('p')[1].get_text()
    name = title + '.txt'
    with io.open(name, "w", encoding="utf-8") as text_file:
        text_file.write(text)

print('files are ready')
I am trying to web scrape Yahoo's Finance Recommendation Rating using BeautifulSoup but it keeps returning 'None'.
E.g. Recommendation Rating for AAPL is '2'
https://finance.yahoo.com/quote/AAPL/analysis?p=AAPL
Please advise. Thank you!
Below is the code:
from requests import get
from bs4 import BeautifulSoup
tickers = ['AAPL']

for ticker in tickers:
    url = 'https://sg.finance.yahoo.com/quote/%s/profile?p=%s' % (ticker, ticker)
    print(url)

    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    type(html_soup)

    # yf_rec refers to yahoo finance recommendation
    try:
        yf_rec = html_soup.find('div', attrs={'class': 'B(8px) Pos(a) C(white) Py(2px) Px(0) Ta(c) Bdrs(3px) Trstf(eio) Trsde(0.5) Arrow South Bdtc(i)::a Fw(b) Bgc($buy) Bdtc($buy)'}).text.strip()
    except:
        pass
    print(yf_rec)
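As far as I can tell, that rating widget is rendered client-side, so the div with those styling classes may simply not be present in the HTML that requests receives, which would explain the None. One alternative sketch uses the third-party yfinance package; that the field is called recommendationMean is an assumption, so check the keys of the returned dict:

# Sketch of an alternative, assuming yfinance is installed
# (pip install yfinance) and that Yahoo still exposes a
# "recommendationMean" field for the ticker.
import yfinance as yf

for ticker in ['AAPL']:
    info = yf.Ticker(ticker).info
    print(ticker, info.get("recommendationMean"))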
I'm sure this is an easy one, but somehow I've been stuck trying to get the href link under the a tag that jumps to each of the product detail pages. I don't see any JavaScript wrapped around it either. What am I missing?
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
urls = [
    'https://undefeated.com/search?type=product&q=nike'
]

final = []

with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('/Users/Documents/python/Selenium/bin/chromedriver')
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='product-grid-item ']")))]
        soup = bs(driver.page_source, 'lxml')
        time.sleep(1)
        href = soup.find_all['href']
        print(href)
output:
[]
I then tried soup.find_all('a') and it did spit out a whole bunch of tags, including the href I am looking for, but I still cannot specifically extract only the href...
You just have to find_all the a tag and then try to print the href attribute.
Your requests.Session code should be like this:
with requests.Session() as s:
    for url in urls:
        driver = webdriver.Firefox()
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='product-grid-item ']")))]
        soup = bs(driver.page_source, 'lxml')
        time.sleep(1)
        a_links = soup.find_all('a')
        for a in a_links:
            print(a.get('href'))
Then all the links will be printed.
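If you only want the links that jump to the product detail pages, you can narrow the search to the product tiles before pulling the anchors. A sketch that reuses the product-grid-item class from the question's XPath (that every detail link sits inside such a div is an assumption):

# Assumption: each product tile is a div with class 'product-grid-item'
# and contains the anchor that links to the product detail page.
for tile in soup.find_all('div', class_='product-grid-item'):
    link = tile.find('a')
    if link and link.get('href'):
        print(link['href'])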