I am trying to scrape the following page:
https://apps.fcc.gov/oetcf/tcb/reports/Tcb731GrantForm.cfm?mode=COPY&RequestTimeout=500&tcb_code=&application_id=ll686%2BwlPnFzHQb6tru2vw%3D%3D&fcc_id=QDS-BRCM1095
headers_initial = {
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language': 'en-US,en;q=0.9,de;q=0.8',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
}
r = requests.get(url, timeout=100, headers=headers_initial)
print(r.status_code)
print(r.headers)
print(r.text)
my status code is 400
my requests.get gets hung up. I would be very appreciative of any help someone can provide.
Related
I am trying to webscrape some price information from a local supermarket. I am being denied access to the site, and not sure why... I have updated my user identity to be that of google chrome but am still getting the same error. Thanks!
library(rvest)
library(dplyr)
link <- "https://www.paknsave.co.nz/shop/product/5031015_ea_000pns?name=size-7-eggs"
page <- GET(link, add_headers('user-agent' = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"))
import requests
cookies = {
'__cf_bm': 'HrPgHXxwZ21ZHce6g119GTh0TW5PLK226Avwsqr_yRk-1657420237-0-AV3AFcbB1RRPQi9sj9f0jlyEtnLOU3joTSTqvIuc0StLeyezQdAJDeSSBWpSuxYxQLz6k7KvDjIKR4dPCPww4nxztaohWaWLgKR8wJw1OopkzNjFT7V/MPgZknXPuL4W0B//cUxLgOniMWzJyUqDjAPqJ3fIVNZykHBsk3kWx+krXKDl/xVcmgfD0X8HnQoBtw==',
'cf_chl_2': '6362dc2388c492e',
'cf_chl_prog': 'x13',
'cf_clearance': '7Q36fdlfvE_xpzRSuN425iQrAXi0K6t9oMEg9bgBl1E-1657420230-0-150',
'shell#lang': 'en',
'SessionCookieIdV2': '2f331eba017f4978a21db30d38bd58bd',
'SC_ANALYTICS_GLOBAL_COOKIE': '75db5ec972684f1d83a298be947ff26f|False',
'server_nearest_store_v2': '{"StoreId":"3c5e3145-0767-4066-9349-6c0a1313acc5","UserLat":"37.7697","UserLng":"-122.3933","StoreLat":"-35.09945","StoreLng":"173.258322","IsSuccess":true}',
'__RequestVerificationToken': 'i7yGKUCMmP0LpzH6Ir9q8Tin79X0zz2C9mzoUh_VUyNxQNWZ-Gm64inb2J8yRT7C89VdUZc85pIIztehy5ypTrgxBmU1',
'STORE_ID_V2': '3c5e3145-0767-4066-9349-6c0a1313acc5|False',
'Region': 'NI',
'AllowRestrictedItems': 'true',
'sxa_site': 'PAKnSAVE',
'__cfruid': '8f13df268c53d03a3b3440e47baa5df4671d278d-1657420232',
'_gcl_au': '1.1.1855441244.1657420235',
'_ga_8ZFCCVKEC2': 'GS1.1.1657420235.1.1.1657420235.60',
'_ga': 'GA1.1.444441072.1657420235',
'FPLC': 'G6JkKZ86eQgLbN2PTg5DU9nts8HFZj2ZdPTjM6VTo6Johf6YgbfYcZZVDcnxgUmYN%2FdRRR6%2Fz4mEDQIYWroUc8Rhy5%2BXkehpQlNuUN%2Bd11JsFx8S%2FzyGohu9wvfYeA%3D%3D',
'FPID': 'FPID2.3.pLYyjOkBCu9gt8rah2k%2BxfEuOt1pMJfZ%2Fg7VwV%2Fwsy8%3D.1657420235',
'ASP.NET_SessionId': '1rzzw1ls1vagg4fdeayflrm0',
'fs-store-select-tooltip-closed': 'true',
}
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
# 'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.paknsave.co.nz/shop/product/5031015_ea_000pns?name=size-7-eggs&__cf_chl_tk=ZpR7svpE5x07zN1HC3SVHKDAAVXTtdqPUtgz1pBfj.A-1657420229-0-gaNycGzNCD0',
'Origin': 'https://www.paknsave.co.nz',
'DNT': '1',
'Connection': 'keep-alive',
# Requests sorts cookies= alphabetically
# 'Cookie': '__cf_bm=HrPgHXxwZ21ZHce6g119GTh0TW5PLK226Avwsqr_yRk-1657420237-0-AV3AFcbB1RRPQi9sj9f0jlyEtnLOU3joTSTqvIuc0StLeyezQdAJDeSSBWpSuxYxQLz6k7KvDjIKR4dPCPww4nxztaohWaWLgKR8wJw1OopkzNjFT7V/MPgZknXPuL4W0B//cUxLgOniMWzJyUqDjAPqJ3fIVNZykHBsk3kWx+krXKDl/xVcmgfD0X8HnQoBtw==; cf_chl_2=6362dc2388c492e; cf_chl_prog=x13; cf_clearance=7Q36fdlfvE_xpzRSuN425iQrAXi0K6t9oMEg9bgBl1E-1657420230-0-150; shell#lang=en; SessionCookieIdV2=2f331eba017f4978a21db30d38bd58bd; SC_ANALYTICS_GLOBAL_COOKIE=75db5ec972684f1d83a298be947ff26f|False; server_nearest_store_v2={"StoreId":"3c5e3145-0767-4066-9349-6c0a1313acc5","UserLat":"37.7697","UserLng":"-122.3933","StoreLat":"-35.09945","StoreLng":"173.258322","IsSuccess":true}; __RequestVerificationToken=i7yGKUCMmP0LpzH6Ir9q8Tin79X0zz2C9mzoUh_VUyNxQNWZ-Gm64inb2J8yRT7C89VdUZc85pIIztehy5ypTrgxBmU1; STORE_ID_V2=3c5e3145-0767-4066-9349-6c0a1313acc5|False; Region=NI; AllowRestrictedItems=true; sxa_site=PAKnSAVE; __cfruid=8f13df268c53d03a3b3440e47baa5df4671d278d-1657420232; _gcl_au=1.1.1855441244.1657420235; _ga_8ZFCCVKEC2=GS1.1.1657420235.1.1.1657420235.60; _ga=GA1.1.444441072.1657420235; FPLC=G6JkKZ86eQgLbN2PTg5DU9nts8HFZj2ZdPTjM6VTo6Johf6YgbfYcZZVDcnxgUmYN%2FdRRR6%2Fz4mEDQIYWroUc8Rhy5%2BXkehpQlNuUN%2Bd11JsFx8S%2FzyGohu9wvfYeA%3D%3D; FPID=FPID2.3.pLYyjOkBCu9gt8rah2k%2BxfEuOt1pMJfZ%2Fg7VwV%2Fwsy8%3D.1657420235; ASP.NET_SessionId=1rzzw1ls1vagg4fdeayflrm0; fs-store-select-tooltip-closed=true',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Cache-Control': 'max-age=0',
# Requests doesn't support trailers
# 'TE': 'trailers',
}
params = {
'name': 'size-7-eggs',
}
data = {
'md': 'LY3z3moXjvkiL6.TltBjKutBlxr0gRcWotnWZD224Ik-1657420229-0-AdVFW-EtYzqbcg7Spq0beYQxr56eQ35wUZByyeUdPhP2RYKPi-G5qDV3BcVp9a-cMTclDfdEhbvXZLuhffGQLmLiSva5afHqpVZZYRepw3ej7SDL2x_vpDpT7yDzSdOVhiRIYWNgG82LWigFM5t7GPoG9XgJTDbpt7exsP4fbjENcCSQCGPhzI8H1FZVDUmRLDMMRLSFECC_ntCat-xMaNN1-LMQnqb_ASBKE7tQzjtFlZc3Uix4SsRbeZqs1CWJWdVsRMfm8jNh9hhG9NuMIq2ggRZGd7r1va7C8m1aj1UdlbnzM2juswggBe-1J1gMF6ZFrjmbiulBfe-HSwf3h65MlDrX63uJTU4XCg62A9HMGq_5t2IcNa8V93H4fLeJEI-KMsmHmhM-gE-VHHUV1ygSyaK1RQQvDNVF2K9QRFYaMZBc0rjMaJsZd8tiU5vXW4xEAWKDvxZHSAkXqklXpKY58VTudkiRw_xrcAzIkGotTZ3okQwAIV4BFspJOO6ir9yx4MIyjPr53rGvqQEOSa24GHlpAm8EEojo3FGbu_YUX5vjjptyItyM-juiyxqdiWx7dKA1gY-KJjwNpVKYhfAfLgH7EU86WUTPHZK2Zkx1d3URpbaKnll47i18d-dSnfuWwt1NwAv_rcr_tFdC2cdYxoebGLMqd7DIKdRR7BgNve_lOxnVjv6-tS7eIoOuCw10FX3HN_mVv7ez8RXLYKYlbMFeKw_tbUNqRayHFsjifNPwr0nkZI_dCpxfwc56pYIWLprD4GbRvMW8DLvb9780wJteNcbw3lUAwljK_lVX07rqC69W_SEzU_Tx7SA5XA',
'r': 'NhJKZcFC0PI8V8pbbsMGnQLopbV2aQcLxBSjbGH0Rnk-1657420229-0-AV1Uyb7KhY0yniBW0nJsIQ9n0cm+m8jbKXLtyUVB8dsxtMpVww0d5vQUlTu06beYL4XRKa9x8nz8ddLZzJhz/rW61z8Bax45FL5KQqVdrnC5Ki3ul+MZnmLwBC1Do1DYP2837DScEbbPB9lKtzv0M6M0+plSLMwyYKolemCmp3vUI3DWKvvv0SLBP1R/hzqGH56HuR/wTrJEv2mmjOUiqa+FyG9Go6wMIE57FV3WmtaRHp2ZE+QtpgLw8o9l6KUN8wZByC+NfSfBLDDK1ofEtb0aDXPPjr+JfgMeM1rigJOGxsYTN8tcdoQA7B/wvA/xWGJ+V49AoJOWo1pXm0WqXenbsHYbFwYYT8wVoiUAMod6uxPKqxJYGrOJkI6N28k/WjcRQaJ4Tbz6hR18WBN8xayfGtTvfc2vVHpfZGzI2BjKIbGVQ6UL6mgGFsQZ16UDfX0FyDOkvDRtd3Q645K8l9oUt1++PLoEQxmOP21FegAjIRFVy1+WfgKVJgubsnpNRIOxZ+U9EZbgRTDjJO+ruPzzUFPixw74ZNCInHVNE4xwpUWndqDTS17RKX/ZR3auc1rltgtJGHrVFzATgLjAroAhSyy24ddDpvGcRXZZSoaI+X6bXf7A0UVh2BxPvPYmbwsxTXblYxve6enVJvN5drt+n+0nVzoih3VhMHuQqDLebFn+Yfq4OJmVgpq0yXuiGM++JPY/5H8bicbKiCMjG+JSUSJOyJoBItscitorPNSyJC6OX1laLmXarxXbLd2AXUMhXAteXoToSDm1gUbCOkYBR/1q7lv/QuBJBiFV9Rnt3zVRsqNPNGzlz6CzTVPpHGU209I21lA79VYTtnLkAjWVRE/PRARD4qK4JdzAvnJyI8xoBGebUYs+nySaewIMnjslXPQSYLisF53fNLwkcjUoZqq1qMEmw7Wc2fq6DB/9MFTa7ZVc9luxy8mbdyRAeh9XXOfbUNhwp0RYaC9ps0pptrj/2e7FJOhe/r63h+DoA3LhSh8JOc40SGE6ayfsgr5FVAmUwsFvQE2sYuCI4GaULBP+tVkhOrEY9793n09AM9ljQ7Cr2dV+0p80xQdzy7td7pEVOa/qw4IvYPTBLWGHjBacJiON0ARj+uO0RdWR7MSJP/6WGvvF01Tbcdd12Ss4JzqYqc+sDJ9VjqaqawOW79JI7DjUYXPhJqJ6iGPxMDLe939qTpystXf6Fvi3ZGovpBru0aMFlCmTU/HwtkwAG3G5Hzg2GFFr2ViuYzB1TrGzzGDmbOwuWEG6p6l/WCeuY8l5f/NfqTq8oLaGCiDYr9sbJL4EOHbJZ+6tcaoQxD5xm+Yd3jskCqk6MY7vGARUrof/Wl0GhU8znpVZeDa7wKmzGd6XGYG/gJKnM6rOf3I/sEnY8HJ5Hj9o7bZ52x9N80DwPJbGbTvVG9JR9pE1B0MPqrUUM1Omkh2aUh/Co7qAf2qC3aeTBLbwKwXN6TcB6S1yOGcvNMT+eKbdMpA1Ac0YjvD0b1t3/SlK3pkx10kBhXJ3HE0bj/WiqmHNW/OX3FiT7B06ynF+rKPrUPKqQ089/rThZ+VAheq7KveUxJtVXAwkOwe0xn7hk5HuhmKLq1i8psr1eFU9IJYmSB8QENvZ1k4ZOUdBbBZxBeMpA8iA2pu57E/+hTCDvjdpxxETwu84Y1sEHxVO80Qsir25+DDemFMiVi9DRUlyaiZ43dHC/qhrb4TEQiRWpYTpOrjv7Z0YPZUm5O3Q5hyXYfpgeuJ1+0JHLz/KH0U0lNLynMAAjyypipScAruzr25YGHAGsexzTwoQRoVED6nNRbc/4hQcFdNhRIyhd1aDNDkkzOC3gKPn8kjpFQqVmoAQU8Yfv6BohhHMyon5+sNV3Fdp1/az30lILeriDWU7KoL70nmvdyBcmboUyGesJS4GPWAe67E0sU9NLcZF6LzoP1YUmdd0FQZ7wvisAg2yJyBVwXD+eehpLE7gGeXEyAxr7DepYT8wwqEGk4Dcx+4AScviP84T8JKiDiWchaGW/GTjdc/5flgFa3BeR/4W94wDpQ==',
'vc': '1574e190db357034339f269c7c5755d0',
'captcha_vc': 'c3856b069d07c7e16a7767324ca6f885',
'captcha_answer': 'cppbodShTefN-13-7285e0b05b37b8af',
'cf_ch_cp_return': '9a10d0a1037bf2b325009ab7be973b18|{"managed_clearance":"ni"}',
}
response = requests.post('https://www.paknsave.co.nz/shop/product/5031015_ea_000pns', params=params, cookies=cookies, headers=headers, data=data)
test_url = 'https://crimegrade.org/safest-places-in-60629/'
test_headers = {
'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding' : 'gzip, deflate, br',
'accept-language' : 'en-US,en;q=0.9',
'cache-control': 'no-cache',
'cookie': '_ga=GA1.2.1384046872.1654177894; _gid=GA1.2.924008640.1654177894',
'pragma': 'no-cache',
'referer' : 'https://crimegrade.org/crime-by-zip-code/',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests' : '1',
'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
}
crime_response = requests.get(test_url, headers=test_headers)
print(crime_response.content)
I've managed to scrape other websites with a similar approach before, but I haven't been able to get parameters or a clean 200 status code for crimegrade.org. I think that's why I'm getting this response:
\<div class="cf-alert cf-alert-error cf-cookie-error" id="cookie-alert" data-translate="enable_cookies">Please enable cookies.\</div>
Do you have any advice on how to solve this?
Through a bit more reading, watching, & hunting on my end, I managed to get around this with a very conventional method of automating my browsing with Selenium. My code is below.
Note: .page_source gives the html data which can be parsed with BeautifulSoup. It is akin to the .content yield in my original post, except it's the information I need.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
crime_url = 'https://crimegrade.org/safest-places-in-73505/'
chrome_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
chrome_driver.get(crime_url)
crime_html = chrome_driver.page_source
chrome_driver.quit()
I'm trying to do a simple get request but no matter how I'm configuring the headers I keep getting a 403 response. The page loads fine in a browser. No login is required and there are no tracked cookies either. The link I'm trying to get a response from is below, followed by my simple code.
https://i7.sportsdatabase.com/nba/query.json?sdql=50+%3C+Kobe+Bryant%3Apoints+and+site%3Daway&sport=nba
url = 'https://i7.sportsdatabase.com/nba/query.json?sdql=50+%3C+Kobe+Bryant%3Apoints+and+site%3Daway&sport=nba'
headers = {
'Host': 'i7.sportsdatabase.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
}
r = requests.get(url, headers)
I'm not seeing any other headers that need adding to the request. The full, in browser, request headers are below:
Host: i7.sportsdatabase.com
Connection: keep-alive
Cache-Control: max-age=0
sec-ch-ua: "Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"
sec-ch-ua-mobile: ?0
DNT: 1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Sec-Fetch-Site: none
Sec-Fetch-Mode: navigate
Sec-Fetch-User: ?1
Sec-Fetch-Dest: document
Accept-Encoding: gzip, deflate, br
Accept-Language: en,en-US;q=0.9,it;q=0.8,es;q=0.7
If-None-Match: "be833f0fb26eb81487fc09e05c85ac8c8646fc7b"
Try:
Make your URL a string
Add the accepts
This works:
url = 'https://i7.sportsdatabase.com/nba/query.json?sdql=50+%3C+Kobe+Bryant%3Apoints+and+site%3Daway&sport=nba'
headers = {
'Host': 'i7.sportsdatabase.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
}
r = requests.get(url, headers=headers)
Try using .Session()
import requests
s = requests.Session()
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36',
}
s.get('https://i7.sportsdatabase.com/nba/trends', headers=headers)
url = 'https://i7.sportsdatabase.com/nba/query.json?sdql=50+%3C+Kobe+Bryant%3Apoints+and+site%3Daway&sport=nba'
r = s.get(url, headers=headers)
print(r)
Output:
print(r)
<Response [200]>
I managed to get the soup and the html of the webpage, but for some reason can't find the robots tag even though I can find it when scraping in other languages.
Example:
headers = {
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8',
'Cache-Control': 'max-age=0', 'Connection': 'keep-alive',
}
res=requests.get('http://{}'.format("radverdirect.com"), headers=headers, allow_redirects = True)
number= str(res.status_code)
soup = BeautifulSoup(res.text, 'html.parser')
x=soup.find('meta', attrs={'name':'robots'})
out=x.get("content", None)
out
This site returns to me noodp in other languages but here I can't find this tag. Why and how do I fix it?
the link is: https://angel.co/medical-marijuana-dispensaries-1
every time I use requests.get(url), it keeps giving me 403 response, so I cannot parse it
I tried changing the headers: user-agent and the referer but it did not work
import requests
page=requests.get('https://angel.co/medical-marijuana-dispensaries-1')
page
<Response [403]>
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'})
session.headers
{'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
page=requests.get('https://angel.co/medical-marijuana-dispensaries-1')
page
<Response [403]>
page=session.get('https://angel.co/medical-marijuana-dispensaries-1')
page
<Response [403]>