Sending post requests with Scrapy

Sending post requests with Scrapy - web-scraping

I'm learning how to do web scraping with Scrapy and I'm having problems with scraping dynamically loaded content. I'm trying to scrape a phone number from a website which sends a POST request in order to obtain the number:
This is the header of the Post request it sends:
Host: www.mymarket.ge
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://www.mymarket.ge/en/pr/16399126/savaWro-inventari/fulis-yuTi
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
X-Requested-With: XMLHttpRequest
Content-Length: 13
Origin: https://www.mymarket.ge
Connection: keep-alive
Cookie: Lang=en; split_test_version=v1; CookieID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJEYXRhIjp7IklEIjozOTUwMDY2MzUsImN0IjoxNTkyMzA2NDMxfSwiVG9rZW5JRCI6Ik55empxVStDa21QT1hKaU9lWE56emRzNHNSNWtcL1wvaVVUYjh2dExCT3ZKWT0iLCJJc3N1ZWRBdCI6MTU5MjMyMTc1MiwiRXhwaXJlc0F0IjoxNTkyMzIyMDUyfQ.mYR-I_51WLQbzWi-EH35s30soqoSDNIoOyXgGQ4Eu84; ka=da; SHOW_BETA_POPUP=B; APP_VERSION=B; LastSearch=%7B%22CatID%22%3A%22515%22%7D; PHPSESSID=eihhfcv85liiu3kt55nr9fhu5b; PopUpLog=%7B%22%2A%22%3A%222020-05-07+15%3A13%3A29%22%7D
and this is the body:
PrID=16399126
I successfully managed to replicate the post request on reqbin.com, but can't figure out how to do it with Scrapy. This is what my code looks like:
class MymarketcrawlerSpider(CrawlSpider):
    """Crawl mymarket.ge product cards and load a ``MymarketItem`` per product page.

    ``parse_item`` builds one ``ItemLoader`` for the product page, schedules a
    POST to the ``ShowFullNumber`` endpoint whose callback (``parse_num``) adds
    the phone number to that same loader, and then continues to fill the
    loader from the product page and yields the loaded item.
    """

    name = "mymarketcrawler"
    allowed_domains = ["mymarket.ge"]
    start_urls = ["http://mymarket.ge/"]
    rules = (
        Rule(
            LinkExtractor(allow=r".*mymarket.ge/ka/*", restrict_css=".product-card"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Schedule the phone-number request, then load and yield the page fields.

        Note that the item yielded at the end of this method is produced by
        this generator itself; the ``number`` value is only added later, when
        ``parse_num`` runs as the callback of the POST request.
        """
        item_loader = ItemLoader(item=MymarketItem(), response=response)

        def parse_num(response):
            """Callback of the ShowFullNumber POST: add ``number`` and yield the item."""
            try:
                response_text = response.text
                response_dict = ast.literal_eval(response_text)
                number = response_dict['Data']['Data']['numberToShow']
                nonlocal item_loader
                item_loader.add_value("number", number)
                yield item_loader.load_item()
            except Exception as e:
                # Any failure while reading the number stops the whole crawl.
                raise CloseSpider(e)

        yield FormRequest.from_response(
            response,
            url=r"https://www.mymarket.ge/ka/pr/ShowFullNumber/",
            headers={
                "Host": "www.mymarket.ge",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "*/*",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": "https://www.mymarket.ge/ka/pr/16399126/savaWro-inventari/fulis-yuTi",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
            },
            formdata={"PrID": "16399126"},
            method="POST",
            dont_filter=True,
            callback=parse_num
        )

        # XPath attribute tests use "@attr"; the product-page fields follow.
        item_loader.add_xpath(
            "seller", "//div[@class='d-flex user-profile']/div/span/text()"
        )
        item_loader.add_xpath(
            "product",
            "//div[contains(@class, 'container product')]//h1[contains(@class, 'product-title')]/text()",
        )
        item_loader.add_xpath(
            "price",
            "//div[contains(@class, 'container product')]//span[contains(@class, 'product-price')][1]/text()",
            TakeFirst(),
        )
        item_loader.add_xpath(
            "images",
            "//div[@class='position-sticky']/ul[@id='imageGallery']/li/@data-src",
        )
        item_loader.add_xpath(
            "condition", "//div[contains(@class, 'condition-label')]/text()"
        )
        item_loader.add_xpath(
            "city",
            "//div[@class='d-flex font-14 font-weight-medium location-views']/span[contains(@class, 'location')]/text()",
        )
        item_loader.add_xpath(
            "number_of_views",
            "//div[@class='d-flex font-14 font-weight-medium location-views']/span[contains(@class, 'svg-18')]/span/text()",
        )
        item_loader.add_xpath(
            "publish_date",
            "//div[@class='d-flex left-side']//div[contains(@class, 'font-12')]/span[2]/text()",
        )
        item_loader.add_xpath(
            "total_products_amount",
            "//div[contains(@class, 'user-profile')]/div/a/text()",
            re=r"\d+",
        )
        item_loader.add_xpath(
            "description", "//div[contains(@class, 'texts full')]/p/text()"
        )
        item_loader.add_value("url", response.url)
        yield item_loader.load_item()
The code above doesn't work: the number field is not populated.
I can print the number to the screen, but I am unable to save it to the CSV file. The number column in the CSV file is blank; it doesn't contain any values.

Scrapy works asynchronously and every link to crawl, every item to process, etc. is put inside a queue. That is why you yield a request and wait for a SpiderDownloader, ItemPipeline, etc. to process your request.
What is happening is that your two requests are processed separately, which is why you don't see the number in your results. Personally, I would parse the results from the first request, save them in the request's 'meta' data and pass them along to the next request, so that the data is available in its callback.
E.g.
class MymarketcrawlerSpider(CrawlSpider):
    """Crawl mymarket.ge product cards and emit one item per product.

    The phone number is only returned by a separate POST request, so the
    fields scraped from the product page are carried to that request's
    callback in ``meta`` and the item is built there, once both parts exist.
    """

    name = "mymarketcrawler"
    allowed_domains = ["mymarket.ge"]
    start_urls = ["http://mymarket.ge/"]
    rules = (
        Rule(
            LinkExtractor(allow=r".*mymarket.ge/ka/*", restrict_css=".product-card"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Scrape the product page, then request the phone number for that product.

        Yields a single POST request whose callback is ``parse_num``; nothing
        is yielded when the product id cannot be read from ``response.url``.
        """
        import re  # local import: this snippet has no visible import section

        # Rewrite your parsers like this: extract the value here and pass it on.
        product = response.xpath(
            "//div[contains(@class, 'container product')]//h1[contains(@class, 'product-title')]/text()"
        ).get()

        # The product id is the numeric path segment after "/pr/"
        # (e.g. .../pr/16399126/...); use the id of the page being parsed
        # instead of a hard-coded one, otherwise every item gets the same number.
        id_match = re.search(r"/pr/(\d+)", response.url)
        if id_match is None:
            self.logger.warning("No product id found in %s", response.url)
            return

        # A plain FormRequest sends exactly "PrID=<id>" as the body;
        # from_response() would additionally merge in fields of a page form.
        # Passing formdata makes the request a POST.
        yield FormRequest(
            url="https://www.mymarket.ge/ka/pr/ShowFullNumber/",
            headers={
                "Host": "www.mymarket.ge",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "*/*",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": response.url,
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
            },
            formdata={"PrID": id_match.group(1)},
            dont_filter=True,
            callback=self.parse_num,
            # New part: hand the already-scraped value to the callback.
            meta={"product": product},
        )

    def parse_num(self, response):
        """Build the item from the number response plus the values carried in ``meta``.

        Raises:
            CloseSpider: if the response body is not the expected JSON structure.
        """
        import json  # local import: this snippet has no visible import section

        item_loader = ItemLoader(item=MymarketItem(), response=response)
        try:
            # json.loads understands true/false/null, which ast.literal_eval rejects.
            payload = json.loads(response.text)
            number = payload['Data']['Data']['numberToShow']
        except (ValueError, KeyError, TypeError) as e:
            raise CloseSpider(f"unexpected ShowFullNumber response: {e!r}") from e

        # No closure / nonlocal needed: everything arrives with the response.
        item_loader.add_value("number", number)
        item_loader.add_value("product", response.meta['product'])
        yield item_loader.load_item()

Related

Scrapy parsed unknown character

I want to scrape the site https://www.bikebd.com/brand/yamaha/. Here is my script:
import scrapy
from scrapy.utils.response import open_in_browser
from urllib.parse import urlencode
class BikebdSpider(scrapy.Spider):
    """Scrape bike titles from the bikebd.com Yamaha brand page."""

    name = 'bikebd'
    allowed_domains = ['www.bikebd.com']

    # NOTE(review): these headers are defined on the class but are not passed
    # to any request in this snippet.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,da;q=0.8',
        'cache-control':'no-cache',
        'cookie': '_ga=GA1.1.1549289426.1669609851; XSRF-TOKEN=eyJpdiI6IjhCN1BnV0RoK3dOQnFEQlhYRUZVZEE9PSIsInZhbHVlIjoiTFQ4Ym15MWhoU1hmR3FxaWdVYnkvbnovMTVDbS9iRm1OVCsrV0F2RzA5dHlmMWpObENoSFBWY0VXclBNWkZaNlV1aitwSXBWNjhNMGs2Z3JqQ3ZvQWVIQ25QcnNOZkNpR3lwMGNkL01aWHM3VDZ5YmZJblRha0kyUk5IMTh2UzQiLCJtYWMiOiJjMzFmMDZlZDFjNzVhNTVlZjY1MWEzNWJkZjY5Y2Q1MjFiZmNmM2UxOWRiZWJlMGRhZWY5OGU0MGQ4OWI5N2ViIiwidGFnIjoiIn0%3D; bikebd_session=eyJpdiI6ImVVb2NqcmFLR2dKSXc2NnNqUlV6ZWc9PSIsInZhbHVlIjoibUVNcEZidUxsbWdkK3c2UDFYdDYwcHFOdVU1WmVXY0ZiV1pHRzJBbzlaUDNuWGl2Vk1OTk5QYnRkdmVXdDg3bEx2SEpiMGE1c2dvakdkU0tQOTBucHc5ajRpcGpod2ViL3B2ME9DRXc4SUFtSG56YU9MVTdEVi9rYW8reXk0TDYiLCJtYWMiOiI5MmU2NWEyZDhkOGFiNTdkYzQ0ZGJhMDQwNzFhYzFmOGY4MzNjNWU2ODczYWNiOTVlNjU4MWUyZWVmMzE5NjNmIiwidGFnIjoiIn0%3D; _ga_HEG073JLWK=GS1.1.1670663205.2.1.1670663540.0.0.0',
        'pragma': 'no-cache',
        'referer': 'https://www.bikebd.com/bike-price-in-bd',
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': "Windows",
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        """Yield one request per listing URL, handled by ``parse``."""
        urls = ["https://www.bikebd.com/brand/yamaha"]
        for url in urls:
            yield scrapy.Request(url= url , callback=self.parse)

    def parse(self, response):
        """Yield a ``{'title': ...}`` dict for every bike card found on the page."""
        # XPath attribute tests are written "@class".
        container = response.xpath("//div[@class='tab-cards-rev']/div/div[@class='col-md-3']")
        for item in container:
            title = item.xpath(".//h2[@class='ltn-tittle mb-0']/strong/text()").get()
            yield {'title' : title}
But when I run the spider, it returns None. I then started debugging the spider by opening the response in a browser with this code:
def parse(self, response):
    # Debugging aid: hand the downloaded response to open_in_browser so the
    # content Scrapy actually received can be inspected.
    open_in_browser(response)
Then it showed me some unreadable characters like below.
ôÿ‘šôC##tøœ÷ÿfjun_N?ÈKÛžDÜ$YVyË²UÒ–³TUoŸÄ'…8hI®ö,ëá4·9g¶åÝûtÎOûUéCh|â¢€Ð8`÷ D“†b&³“ÝªW¯ª~÷À"Á¹¹a]Ðøß¿{©wŽ(€è ¼ÇX¶nû»¥ˆŠ¦eÙËÿ«Íñ"åY³1Vÿõ¯³½ÍUDÃ±‡½â`¹‰½é½ê”§Œl‡%,{Š»È?8PaÐ-œ[·EÏ&Žl,ö‰êµŽÄ€ŠŒ+ŒMØèãG{L˜ž2 ?£?èaÂ´UWÞ$[0²üÃZ’‡N±ÅÔ%$[pÝ9ä[ ¯±ÖÞW(ñ¥-ˆxf¿ì±
What is going on with this site? I need some help.

How to get around 403 error when webscraping with R

I am trying to webscrape some price information from a local supermarket. I am being denied access to the site, and not sure why... I have updated my user identity to be that of google chrome but am still getting the same error. Thanks!
library(rvest)
library(dplyr)
# GET() and add_headers() are httr functions; rvest and dplyr do not export
# them, so httr has to be attached explicitly.
library(httr)

link <- "https://www.paknsave.co.nz/shop/product/5031015_ea_000pns?name=size-7-eggs"
# Request the product page while presenting a desktop Chrome User-Agent.
page <- GET(link, add_headers('user-agent' = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"))

import requests
# Cookies captured from a browser session; passed as cookies= to the
# requests.post call in this snippet. The values are opaque, session-specific
# tokens. NOTE(review): the cf_* entries look like Cloudflare challenge
# cookies and are presumably short-lived - confirm before reusing.
cookies = {
    '__cf_bm': 'HrPgHXxwZ21ZHce6g119GTh0TW5PLK226Avwsqr_yRk-1657420237-0-AV3AFcbB1RRPQi9sj9f0jlyEtnLOU3joTSTqvIuc0StLeyezQdAJDeSSBWpSuxYxQLz6k7KvDjIKR4dPCPww4nxztaohWaWLgKR8wJw1OopkzNjFT7V/MPgZknXPuL4W0B//cUxLgOniMWzJyUqDjAPqJ3fIVNZykHBsk3kWx+krXKDl/xVcmgfD0X8HnQoBtw==',
    'cf_chl_2': '6362dc2388c492e',
    'cf_chl_prog': 'x13',
    'cf_clearance': '7Q36fdlfvE_xpzRSuN425iQrAXi0K6t9oMEg9bgBl1E-1657420230-0-150',
    'shell#lang': 'en',
    'SessionCookieIdV2': '2f331eba017f4978a21db30d38bd58bd',
    'SC_ANALYTICS_GLOBAL_COOKIE': '75db5ec972684f1d83a298be947ff26f|False',
    'server_nearest_store_v2': '{"StoreId":"3c5e3145-0767-4066-9349-6c0a1313acc5","UserLat":"37.7697","UserLng":"-122.3933","StoreLat":"-35.09945","StoreLng":"173.258322","IsSuccess":true}',
    '__RequestVerificationToken': 'i7yGKUCMmP0LpzH6Ir9q8Tin79X0zz2C9mzoUh_VUyNxQNWZ-Gm64inb2J8yRT7C89VdUZc85pIIztehy5ypTrgxBmU1',
    'STORE_ID_V2': '3c5e3145-0767-4066-9349-6c0a1313acc5|False',
    'Region': 'NI',
    'AllowRestrictedItems': 'true',
    'sxa_site': 'PAKnSAVE',
    '__cfruid': '8f13df268c53d03a3b3440e47baa5df4671d278d-1657420232',
    '_gcl_au': '1.1.1855441244.1657420235',
    '_ga_8ZFCCVKEC2': 'GS1.1.1657420235.1.1.1657420235.60',
    '_ga': 'GA1.1.444441072.1657420235',
    'FPLC': 'G6JkKZ86eQgLbN2PTg5DU9nts8HFZj2ZdPTjM6VTo6Johf6YgbfYcZZVDcnxgUmYN%2FdRRR6%2Fz4mEDQIYWroUc8Rhy5%2BXkehpQlNuUN%2Bd11JsFx8S%2FzyGohu9wvfYeA%3D%3D',
    'FPID': 'FPID2.3.pLYyjOkBCu9gt8rah2k%2BxfEuOt1pMJfZ%2Fg7VwV%2Fwsy8%3D.1657420235',
    'ASP.NET_SessionId': '1rzzw1ls1vagg4fdeayflrm0',
    'fs-store-select-tooltip-closed': 'true',
}
# Request headers captured from a Firefox session; passed as headers= to the
# requests.post call in this snippet.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding' ('gzip, deflate, br') is deliberately not set here.
    'Referer': 'https://www.paknsave.co.nz/shop/product/5031015_ea_000pns?name=size-7-eggs&__cf_chl_tk=ZpR7svpE5x07zN1HC3SVHKDAAVXTtdqPUtgz1pBfj.A-1657420229-0-gaNycGzNCD0',
    'Origin': 'https://www.paknsave.co.nz',
    'DNT': '1',
    'Connection': 'keep-alive',
    # The raw 'Cookie' header is intentionally not set here: it would repeat
    # the entries of the `cookies` dict, which is sent through the cookies=
    # argument instead (Requests sorts cookies= alphabetically).
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
    # Requests doesn't support trailers, so 'TE': 'trailers' is not sent.
}
# Query-string parameters appended to the product URL.
params = dict(name='size-7-eggs')
# Form body captured from the browser's POST. The values are opaque,
# session-specific tokens copied verbatim.
data = {
    'md': 'LY3z3moXjvkiL6.TltBjKutBlxr0gRcWotnWZD224Ik-1657420229-0-AdVFW-EtYzqbcg7Spq0beYQxr56eQ35wUZByyeUdPhP2RYKPi-G5qDV3BcVp9a-cMTclDfdEhbvXZLuhffGQLmLiSva5afHqpVZZYRepw3ej7SDL2x_vpDpT7yDzSdOVhiRIYWNgG82LWigFM5t7GPoG9XgJTDbpt7exsP4fbjENcCSQCGPhzI8H1FZVDUmRLDMMRLSFECC_ntCat-xMaNN1-LMQnqb_ASBKE7tQzjtFlZc3Uix4SsRbeZqs1CWJWdVsRMfm8jNh9hhG9NuMIq2ggRZGd7r1va7C8m1aj1UdlbnzM2juswggBe-1J1gMF6ZFrjmbiulBfe-HSwf3h65MlDrX63uJTU4XCg62A9HMGq_5t2IcNa8V93H4fLeJEI-KMsmHmhM-gE-VHHUV1ygSyaK1RQQvDNVF2K9QRFYaMZBc0rjMaJsZd8tiU5vXW4xEAWKDvxZHSAkXqklXpKY58VTudkiRw_xrcAzIkGotTZ3okQwAIV4BFspJOO6ir9yx4MIyjPr53rGvqQEOSa24GHlpAm8EEojo3FGbu_YUX5vjjptyItyM-juiyxqdiWx7dKA1gY-KJjwNpVKYhfAfLgH7EU86WUTPHZK2Zkx1d3URpbaKnll47i18d-dSnfuWwt1NwAv_rcr_tFdC2cdYxoebGLMqd7DIKdRR7BgNve_lOxnVjv6-tS7eIoOuCw10FX3HN_mVv7ez8RXLYKYlbMFeKw_tbUNqRayHFsjifNPwr0nkZI_dCpxfwc56pYIWLprD4GbRvMW8DLvb9780wJteNcbw3lUAwljK_lVX07rqC69W_SEzU_Tx7SA5XA',
    # This value was split across two lines inside a single-quoted literal,
    # which is a syntax error. The two pieces are joined here by adjacent
    # string literals, so the resulting value contains no line break.
    'r': (
        'NhJKZcFC0PI8V8pbbsMGnQLopbV2aQcLxBSjbGH0Rnk-1657420229-0-AV1Uyb7KhY0yniBW0nJsIQ9n0cm+m8jbKXLtyUVB8dsxtMpVww0d5vQUlTu06beYL4XRKa9x8nz8ddLZzJhz/rW61z8Bax45FL5KQqVdrnC5Ki3ul+MZnmLwBC1Do1DYP2837DScEbbPB9lKtzv0M6M0+plSLMwyYKolemCmp3vUI3DWKvvv0SLBP1R/hzqGH56HuR/wTrJEv2mmjOUiqa+FyG9Go6wMIE57FV3WmtaRHp2ZE+QtpgLw8o9l6KUN8wZByC+NfSfBLDDK1ofEtb0aDXPPjr+JfgMeM1rigJOGxsYTN8tcdoQA7B/wvA/xWGJ+V49AoJOWo1pXm0WqXenbsHYbFwYYT8wVoiUAMod6uxPKqxJYGrOJkI6N28k/WjcRQaJ4Tbz6hR18WBN8xayfGtTvfc2vVHpfZGzI2BjKIbGVQ6UL6mgGFsQZ16UDfX0FyDOkvDRtd3Q645K8l9oUt1++PLoEQxmOP21FegAjIRFVy1+WfgKVJgubsnpNRIOxZ+U9EZbgRTDjJO+ruPzzUFPixw74ZNCInHVNE4xwpUWndqDTS17RKX/ZR3auc1rltgtJGHrVFzATgLjAroAhSyy24ddDpvGcRXZZSoaI+X6bXf7A0UVh2BxPvPYmbwsxTXblYxve6enVJvN5drt+n+0nVzoih3VhMHuQqDLebFn+Yfq4OJmVgpq0yXuiGM++JPY/5H8bicbKiCMjG+JSUSJOyJoBItscitorPNSyJC6OX1laLmXarxXbLd2AXUMhXAteXoToSDm1gUbCOkYBR/1q7lv/QuBJBiFV9Rnt3zVRsqNPNGzlz6CzTVPpHGU209I21lA79VYTtnLkAjWVRE/PRARD4qK4JdzAvnJyI8xoBGebUYs+nySaewIMnjslXPQSYLisF53fNLwkcjUoZqq1qMEmw7Wc2fq6DB/9MFTa7ZVc9luxy8mbdyRAeh9XXOfbUNhwp0RYaC9ps0pptrj/2e7FJOhe/r63h+DoA3LhSh8JOc40SGE6ayfsgr5FVAmUwsFvQE2sYuCI4GaULBP+tVkhOrEY9793n09AM9ljQ7Cr2dV+0p80xQdzy7td7pEVOa/qw4IvYPTBLWGHjBacJiON0ARj+uO0RdWR7MSJP/6WGvvF01Tbcdd12Ss4JzqYqc+sDJ9VjqaqawOW79JI7DjUYXPhJqJ6iGPxMDLe939qTpystXf6Fvi3ZGovpBru0aMFlCmTU/HwtkwAG3G5Hzg2GFFr2ViuYzB1TrGzzGDmbOwuWEG6p6l/WCeuY8l5f/NfqTq8oLaGCiDYr9sbJL4EOHbJZ+6tcaoQxD5xm+Yd3jskCqk6MY7vGARUrof/Wl0GhU8znpVZeDa7wKmzGd6XGYG/gJKnM6rOf3I/sEnY8HJ5Hj9o7bZ52x9N80DwPJbGbTvVG9JR9pE1B0MPqrUUM1Omkh2aUh/Co7qAf2qC3aeTBLbwKwXN6TcB6S1yOGcvNMT+eKbdMpA1Ac0YjvD0b1t3/SlK3pkx10kBhXJ3HE0bj/WiqmHNW/OX3FiT7B06ynF+rKPrUPKqQ089/rThZ+VAheq7KveUxJtVXAwkOwe0xn7hk5HuhmKLq1i8psr1eFU9IJYmSB8QENvZ1k4ZOUdBbBZxBeMpA8iA2pu57E/+hTCDvjdpxxETwu84Y1sEHxVO80Qsir25+DDemFMiVi9DRUlyaiZ43dHC/qhrb4TEQiRWpYTpOrjv7Z0YPZUm5O3Q5hyXYfpgeuJ1+0JHLz/KH0U0lNLynMAAjyypipScAruzr25YGHAGsexzTwoQRoVED6nNRbc/4hQcFdNhRIyhd1aDNDkkzOC3gKPn8kjpFQqVmoAQU8Yfv6BohhHMyon5+sNV3Fdp1/az30lILeriDWU7KoL70nmvdyBcmboUyGesJS4GPWAe67E0sU9NLcZF6'
        'LzoP1YUmdd0FQZ7wvisAg2yJyBVwXD+eehpLE7gGeXEyAxr7DepYT8wwqEGk4Dcx+4AScviP84T8JKiDiWchaGW/GTjdc/5flgFa3BeR/4W94wDpQ=='
    ),
    'vc': '1574e190db357034339f269c7c5755d0',
    'captcha_vc': 'c3856b069d07c7e16a7767324ca6f885',
    'captcha_answer': 'cppbodShTefN-13-7285e0b05b37b8af',
    'cf_ch_cp_return': '9a10d0a1037bf2b325009ab7be973b18|{"managed_clearance":"ni"}',
}
# Requests applies no timeout by default, so without one this call can block
# indefinitely if the server never answers.
response = requests.post(
    'https://www.paknsave.co.nz/shop/product/5031015_ea_000pns',
    params=params,
    cookies=cookies,
    headers=headers,
    data=data,
    timeout=30,
)

How do I scrape a website that ignores my headers?

# Fetch one crimegrade.org page with browser-like headers and print the raw body.
test_url = 'https://crimegrade.org/safest-places-in-60629/'
test_headers = {
    'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding' : 'gzip, deflate, br',
    'accept-language' : 'en-US,en;q=0.9',
    'cache-control': 'no-cache',
    'cookie': '_ga=GA1.2.1384046872.1654177894; _gid=GA1.2.924008640.1654177894',
    'pragma': 'no-cache',
    'referer' : 'https://crimegrade.org/crime-by-zip-code/',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests' : '1',
    'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
}
# Requests applies no timeout by default; set one so the call cannot hang forever.
crime_response = requests.get(test_url, headers=test_headers, timeout=30)
print(crime_response.content)
I've managed to scrape other websites with a similar approach before, but I haven't been able to get parameters or a clean 200 status code for crimegrade.org. I think that's why I'm getting this response:
<div class="cf-alert cf-alert-error cf-cookie-error" id="cookie-alert" data-translate="enable_cookies">Please enable cookies.</div>
Do you have any advice on how to solve this?

Through a bit more reading, watching, & hunting on my end, I managed to get around this with a very conventional method of automating my browsing with Selenium. My code is below.
Note: .page_source gives the html data which can be parsed with BeautifulSoup. It is akin to the .content yield in my original post, except it's the information I need.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Load the page in a real Chrome instance and keep the rendered HTML.
crime_url = 'https://crimegrade.org/safest-places-in-73505/'
chrome_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    chrome_driver.get(crime_url)
    crime_html = chrome_driver.page_source
finally:
    # Always shut the browser down, even if loading the page raised;
    # otherwise the Chrome/chromedriver processes are left running.
    chrome_driver.quit()

scrapy returns response.status 505

scrapy when trying to open the site returns response.status 505
505 HTTP Version Not Supported
The same site opens normally in the browser. Why might this be? How can this be fixed?
I call scrapy in console by this command line:
scrapy shell 'https://xiaohua.zol.com.cn/detail60/59411.html'

You should send proper headers with the request to get the data. Here is a demo, followed by its output:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
class Xiaohua(scrapy.Spider):
    """Demo spider: fetch one xiaohua.zol.com.cn page with browser-like headers."""

    name = 'xiaohua'
    # Scrapy expects start_urls to be a list of URLs, not a single string.
    start_urls = ['https://xiaohua.zol.com.cn/detail60/59411.html']

    def start_requests(self):
        """Yield one request per start URL, each carrying the browser headers."""
        headers = {
            'authority': 'xiaohua.zol.com.cn',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Linux"',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'cross-site',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': 'z_pro_city=s_provice%3Dmengjiala%26s_city%3Dnull; userProvinceId=1; userCityId=0; userCountyId=0; userLocationId=1; ip_ck=7sWD7/jzj7QuOTIyODI0LjE2MzQxMTQxNzg%3D; lv=1634114179; vn=1; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1634114179; _ga=GA1.3.116086394.1634114186; _gid=GA1.3.2021660129.1634114186; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1634114447; questionnaire_pv=1634083202; z_day=ixgo20%3D1%26icnmo11564%3D1; 22aa20c0da0b6f1d9a3155e8bf4c364e=cq11lgg54n27u10p%7B%7BZ%7D%7D%7B%7BZ%7D%7Dnull; MyZClick_22aa20c0da0b6f1d9a3155e8bf4c364e=/html/body/div%5B5%5D/div/div/div%5B2%5D/p/a/',
        }
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        """Print the status code, the article title and the navigation labels."""
        print(response.status)
        print('*'*10)
        print(response.css('h1.article-title::text').get())
        print(response.css('ul.nav > li > a::text').getall())
        print('*'*10)
# Run the spider only when this file is executed as a script, so importing
# the module does not start a crawl as a side effect.
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(Xiaohua)
    process.start()
output
200
**********
导演你能认真点儿吗
['笑话首页', '最新笑话', '冷笑话', '搞笑趣图', '搞笑视频', '上传笑话']
**********

Can't scrape robots in python using beautifulsoup

I managed to get the soup and the html of the webpage, but for some reason can't find the robots tag even though I can find it when scraping in other languages.
Example:
import re  # used for the case-insensitive meta-name match below

headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # The wildcard media type is "*/*"; a bare "/" is not a valid media range.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0', 'Connection': 'keep-alive',
}
# Requests applies no timeout by default; set one so the call cannot hang forever.
res = requests.get(
    'http://{}'.format("radverdirect.com"),
    headers=headers,
    allow_redirects=True,
    timeout=30,
)
number = str(res.status_code)
soup = BeautifulSoup(res.text, 'html.parser')
# Match the name attribute case-insensitively so that e.g. name="ROBOTS"
# or name="Robots" is found as well.
x = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.IGNORECASE)})
# find() returns None when no such tag exists; avoid calling .get on None.
out = x.get("content", None) if x is not None else None
out
This site returns to me noodp in other languages but here I can't find this tag. Why and how do I fix it?

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Sending post requests with Scrapy - web-scraping

Related

Scrapy parsed unknown character

How to get around 403 error when webscraping with R

How do I scrape a website that ignores my headers?

scrapy returns response.status 505

Can't scrape robots in python using beautifulsoup

Categories

Resources