Why does a request work with requests but not with Scrapy - web-scraping

I'm trying to scrape a webpage that loads the results for page 2 and onwards as I scroll. I found the URL of the API it calls (see the image), so it should work just fine.
But it only works if I use the requests lib. When I run requests.get() with the same URL I use in Scrapy, I get a 200 response, but Scrapy returns a 500 status. I don't know why this doesn't work with Scrapy; can anyone explain?
Here's what I'm trying to do:
Thanks.
import scrapy
import json
import re

class ScrapeVagas(scrapy.Spider):
    name = "vagas"
    base_url = "https://www.trabalhabrasil.com.br/api/v1.0/Job/List?idFuncao=0&idCidade=5345&pagina=%d&pesquisa=&ordenacao=1&idUsuario="
    start_urls = [base_url % 100]
    download_delay = 1

    def parse(self, response):
        vagas = json.loads(response.text)
        for vaga in range(0, len(vagas)):
            yield {
                "vaga": vagas[vaga]["df"],
                "salario": re.sub(r"[R$.]", "", vagas[vaga]["sl"]).strip()
            }
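For comparison, the plain requests call mentioned above (the one that reportedly returns 200 for the same URL) would look roughly like this; this is just a sketch with no extra headers:

import requests

base_url = "https://www.trabalhabrasil.com.br/api/v1.0/Job/List?idFuncao=0&idCidade=5345&pagina=%d&pesquisa=&ordenacao=1&idUsuario="

# Plain requests call for comparison: reportedly returns 200,
# while the Scrapy spider above gets a 500 for the same URL.
r = requests.get(base_url % 100)
print(r.status_code)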

You are getting a 500 Internal Server Error, a response code that indicates the server encountered an unexpected condition that prevented it from fulfilling the request.
You need to send proper request headers to get the correct response. See the output in scrapy shell below.
import scrapy

base_url = "https://www.trabalhabrasil.com.br/api/v1.0/Job/List?idFuncao=0&idCidade=5345&pagina=%d&pesquisa=&ordenacao=1&idUsuario="
start_urls = [base_url % 100]
url = start_urls[0]
headers = {
    "USER-AGENT": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.3",
    "referer": "https://www.trabalhabrasil.com.br/vagas-empregos-em-sao-paulo-sp",
    "authority": "www.trabalhabrasil.com.br",
    "path": "/api/v1.0/Job/List?idFuncao=100&idCidade=5345&pagina=65&pesquisa=&ordenacao=1&idUsuario=",
    "scheme": "https",
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,bn;q=0.8",
    "dnt": "1",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
}
r = scrapy.Request(url, headers=headers)
fetch(r)
2021-01-22 00:30:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.trabalhabrasil.com.br/api/v1.0/Job/List?idFuncao=0&idCidade=5345&pagina=100&pesquisa=&ordenacao=1&idUsuario=> (referer: https://www.trabalhabrasil.com.br/vagas-empregos-em-sao-paulo-sp)
In [19]: response.status
Out[19]: 200
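Once the shell test passes, the same headers can be sent from the spider itself, for example by overriding start_requests. This is only a sketch based on the spider and the header set above; it may well be that the User-Agent and Referer alone are enough, but that has not been verified here:

import scrapy
import json
import re

class ScrapeVagas(scrapy.Spider):
    name = "vagas"
    base_url = "https://www.trabalhabrasil.com.br/api/v1.0/Job/List?idFuncao=0&idCidade=5345&pagina=%d&pesquisa=&ordenacao=1&idUsuario="
    download_delay = 1

    # Browser-like headers taken from the scrapy shell session above.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.3",
        "referer": "https://www.trabalhabrasil.com.br/vagas-empregos-em-sao-paulo-sp",
        "accept": "*/*",
    }

    def start_requests(self):
        # Send the headers with every request instead of Scrapy's default User-Agent.
        yield scrapy.Request(self.base_url % 100, headers=self.headers, callback=self.parse)

    def parse(self, response):
        vagas = json.loads(response.text)
        for vaga in vagas:
            yield {
                "vaga": vaga["df"],
                "salario": re.sub(r"[R$.]", "", vaga["sl"]).strip(),
            }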

Related

Scrapy parsed unknown character

I want to scrape the site https://www.bikebd.com/brand/yamaha/. Here is my script:
import scrapy
from scrapy.utils.response import open_in_browser
from urllib.parse import urlencode

class BikebdSpider(scrapy.Spider):
    name = 'bikebd'
    allowed_domains = ['www.bikebd.com']
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,da;q=0.8',
        'cache-control': 'no-cache',
        'cookie': '_ga=GA1.1.1549289426.1669609851; XSRF-TOKEN=eyJpdiI6IjhCN1BnV0RoK3dOQnFEQlhYRUZVZEE9PSIsInZhbHVlIjoiTFQ4Ym15MWhoU1hmR3FxaWdVYnkvbnovMTVDbS9iRm1OVCsrV0F2RzA5dHlmMWpObENoSFBWY0VXclBNWkZaNlV1aitwSXBWNjhNMGs2Z3JqQ3ZvQWVIQ25QcnNOZkNpR3lwMGNkL01aWHM3VDZ5YmZJblRha0kyUk5IMTh2UzQiLCJtYWMiOiJjMzFmMDZlZDFjNzVhNTVlZjY1MWEzNWJkZjY5Y2Q1MjFiZmNmM2UxOWRiZWJlMGRhZWY5OGU0MGQ4OWI5N2ViIiwidGFnIjoiIn0%3D; bikebd_session=eyJpdiI6ImVVb2NqcmFLR2dKSXc2NnNqUlV6ZWc9PSIsInZhbHVlIjoibUVNcEZidUxsbWdkK3c2UDFYdDYwcHFOdVU1WmVXY0ZiV1pHRzJBbzlaUDNuWGl2Vk1OTk5QYnRkdmVXdDg3bEx2SEpiMGE1c2dvakdkU0tQOTBucHc5ajRpcGpod2ViL3B2ME9DRXc4SUFtSG56YU9MVTdEVi9rYW8reXk0TDYiLCJtYWMiOiI5MmU2NWEyZDhkOGFiNTdkYzQ0ZGJhMDQwNzFhYzFmOGY4MzNjNWU2ODczYWNiOTVlNjU4MWUyZWVmMzE5NjNmIiwidGFnIjoiIn0%3D; _ga_HEG073JLWK=GS1.1.1670663205.2.1.1670663540.0.0.0',
        'pragma': 'no-cache',
        'referer': 'https://www.bikebd.com/bike-price-in-bd',
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        urls = ["https://www.bikebd.com/brand/yamaha"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        container = response.xpath("//div[@class='tab-cards-rev']/div/div[@class='col-md-3']")
        for item in container:
            title = item.xpath(".//h2[@class='ltn-tittle mb-0']/strong/text()").get()
            yield {'title': title}
But when I run the spider it returns None. So I started debugging it with open_in_browser, using this parse method...
def parse(self, response):
    open_in_browser(response)
Then it showed me some unreadable characters like below.
ôÿ‘šôC##tøœ÷ÿfjun_N?ÈKÛžDÜ$YVy˲UÒ–³TUoŸÄ'…8hI®ö,ëá4·9g¶åÝûtÎOûUéCh|⢀Ð8`÷ D“†b&³“ݪW¯ª~÷À"Á¹¹a]Ðøß¿{©wŽ(€è ¼ÇX¶nû»¥ˆŠ¦eÙËÿ«Íñ"åY³1Vÿõ¯³½ÍUDñ‡½â`¹‰½é½ê”§Œl‡%,{Š»È?8PaÐ-œ[·EÏ&Žl,ö‰êµŽÄ€ŠŒ+ŒMØèãG{L˜ž2 ?£?èa´UWÞ$[0²üÃZ’‡N±ÅÔ%$[pÝ9ä[ ¯±ÖÞW(ñ¥-ˆxf¿ì±
What's going on with the site? I need some help.
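One low-risk way to get more information before (or instead of) rendering the response in a browser is to log what the server actually sent back. This is only a diagnostic sketch and assumes nothing about the actual cause:

def parse(self, response):
    # Diagnostic only: inspect the raw response before opening it in a browser.
    print(response.status)
    print(response.headers.get('Content-Type'))
    print(response.headers.get('Content-Encoding'))
    open_in_browser(response)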

Playwright/Chromium: Request stuck pending for localhost:8082

I just installed Playwright to create some automated tests for my web-app.
I got my test running fine against the staging version of my site, but one of the requests hangs when I run it against localhost:
I have Nginx running on :8080 and webpack running on :8082 to serve my JS. The document ("create") is served from :8080 with no problem, but all.js, which is http://localhost:8082/assets/all.js, never finishes.
What's really confusing me though is that I can load that URL in a new tab in Chrome just fine, I can wget under WSL, and I can curl it under cmd.exe. So there's something funky going on with the networking when the browser instance is created by Playwright, but I don't know how to debug further. The same thing happens if I set defaultBrowserType: 'firefox'.
What else can I try?
I just found chrome://net-export/ and enabled it during the request. I've got all the CLI flags now:
"clientInfo": {
"cl": "b9c217c128c16f53d12f9a02933fcfdec1bf49af-refs/branch-heads/5195#{#176}",
"command_line": "\"C:\\Users\\Mark\\AppData\\Local\\ms-playwright\\chromium-1019\\chrome-win\\chrome.exe\" --disable-field-trial-config --disable-background-networking --enable-features=NetworkService,NetworkServiceInProcess --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --disable-sync --force-color-profile=srgb --metrics-recording-only --no-first-run --enable-automation --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --no-sandbox --auto-open-devtools-for-tabs --deny-permission-prompts --allow-loopback-in-peer-connection --user-data-dir=\"C:\\Users\\Mark\\AppData\\Local\\Temp\\playwright_chromiumdev_profile-5Cth57\" --remote-debugging-pipe --no-startup-window --flag-switches-begin --flag-switches-end --file-url-path-alias=\"/gen=C:\\Users\\Mark\\AppData\\Local\\ms-playwright\\chromium-1019\\chrome-win\\gen\"",
"name": "Chromium",
"official": "unofficial",
"os_type": "Windows NT: 10.0.19044 (x86_64)",
"version": "105.0.5195.19",
"version_mod": ""
},
And a few request details:
{
"params": {
"headers": [
"Host: localhost:8082",
"Connection: keep-alive",
"sec-ch-ua: \"Chromium\";v=\"105\", \"Not)A;Brand\";v=\"8\"",
"Origin: http://localhost:8080",
"Accept-Language: en-CA",
"sec-ch-ua-mobile: ?0",
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
"sec-ch-ua-platform: \"Windows\"",
"Accept: */*",
"Sec-Fetch-Site: same-site",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Dest: script",
"Referer: http://localhost:8080/",
"Accept-Encoding: gzip, deflate, br"
],
"line": "GET /assets/all.js HTTP/1.1\r\n"
},
"phase": 0,
"source": {
"id": 170,
"start_time": "131696227",
"type": 1
},
"time": "131696228",
"type": 169
},
Nothing really jumps out at me as suspicious though.

How do I scrape a website that ignores my headers?

import requests

test_url = 'https://crimegrade.org/safest-places-in-60629/'
test_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'no-cache',
    'cookie': '_ga=GA1.2.1384046872.1654177894; _gid=GA1.2.924008640.1654177894',
    'pragma': 'no-cache',
    'referer': 'https://crimegrade.org/crime-by-zip-code/',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
}

crime_response = requests.get(test_url, headers=test_headers)
print(crime_response.content)
I've managed to scrape other websites with a similar approach before, but I haven't been able to get parameters or a clean 200 status code for crimegrade.org. I think that's why I'm getting this response:
<div class="cf-alert cf-alert-error cf-cookie-error" id="cookie-alert" data-translate="enable_cookies">Please enable cookies.</div>
Do you have any advice on how to solve this?
Through a bit more reading, watching, & hunting on my end, I managed to get around this with a very conventional method of automating my browsing with Selenium. My code is below.
Note: .page_source gives the HTML, which can be parsed with BeautifulSoup. It is akin to the .content output in my original post, except it contains the information I need.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

crime_url = 'https://crimegrade.org/safest-places-in-73505/'

# webdriver_manager downloads a matching ChromeDriver; Selenium then drives a real Chrome.
chrome_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
chrome_driver.get(crime_url)
crime_html = chrome_driver.page_source
chrome_driver.quit()
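Continuing from the snippet above, here is a minimal sketch of feeding page_source into BeautifulSoup; the h2 selector is only a placeholder, not something taken from the real crimegrade.org markup:

from bs4 import BeautifulSoup

# crime_html comes from chrome_driver.page_source in the snippet above.
soup = BeautifulSoup(crime_html, 'html.parser')

# Placeholder selector: adjust it to the elements you actually need on the page.
for heading in soup.select('h2'):
    print(heading.get_text(strip=True))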

scrapy returns response.status 505

When trying to open the site, Scrapy returns response.status 505:
505 HTTP Version Not Supported
The same site opens normally in the browser. Why might this be? How can this be fixed?
I call scrapy in the console with this command line:
scrapy shell 'https://xiaohua.zol.com.cn/detail60/59411.html'
You should use proper headers to extract the data. Here is a demo with output.
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class Xiaohua(scrapy.Spider):
    name = 'xiaohua'
    start_urls = 'https://xiaohua.zol.com.cn/detail60/59411.html'

    def start_requests(self):
        headers = {
            'authority': 'xiaohua.zol.com.cn',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Linux"',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'cross-site',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': 'z_pro_city=s_provice%3Dmengjiala%26s_city%3Dnull; userProvinceId=1; userCityId=0; userCountyId=0; userLocationId=1; ip_ck=7sWD7/jzj7QuOTIyODI0LjE2MzQxMTQxNzg%3D; lv=1634114179; vn=1; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1634114179; _ga=GA1.3.116086394.1634114186; _gid=GA1.3.2021660129.1634114186; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1634114447; questionnaire_pv=1634083202; z_day=ixgo20%3D1%26icnmo11564%3D1; 22aa20c0da0b6f1d9a3155e8bf4c364e=cq11lgg54n27u10p%7B%7BZ%7D%7D%7B%7BZ%7D%7Dnull; MyZClick_22aa20c0da0b6f1d9a3155e8bf4c364e=/html/body/div%5B5%5D/div/div/div%5B2%5D/p/a/',
        }
        yield scrapy.Request(url=self.start_urls, callback=self.parse, headers=headers)

    def parse(self, response):
        print(response.status)
        print('*' * 10)
        print(response.css('h1.article-title::text').get())
        print(response.css('ul.nav > li > a::text').getall())
        print('*' * 10)

process = CrawlerProcess()
process.crawl(Xiaohua)
process.start()
output
200
**********
导演你能认真点儿吗
['笑话首页', '最新笑话', '冷笑话', '搞笑趣图', '搞笑视频', '上传笑话']
**********
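If you prefer not to build the headers dict inside start_requests, a similar effect can be had through the spider's custom_settings, using Scrapy's USER_AGENT and DEFAULT_REQUEST_HEADERS settings. This is only a sketch: whether this reduced header set is enough for this particular site is an assumption, while the full set in the demo above is the one shown to work.

import scrapy

class XiaohuaMinimal(scrapy.Spider):
    # Hypothetical spider name, only to illustrate the settings-based approach.
    name = 'xiaohua_minimal'
    start_urls = ['https://xiaohua.zol.com.cn/detail60/59411.html']

    custom_settings = {
        # Replace Scrapy's default User-Agent with a browser-like one.
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
        # Headers applied to every request this spider makes.
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'accept-language': 'en-US,en;q=0.9',
        },
    }

    def parse(self, response):
        print(response.status)
        print(response.css('h1.article-title::text').get())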

Sending post requests with Scrapy

I'm learning how to do web scraping with Scrapy and I'm having problems with scraping dynamically loaded content. I'm trying to scrape a phone number from a website which sends a POST request in order to obtain the number:
These are the headers of the POST request it sends:
Host: www.mymarket.ge
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://www.mymarket.ge/en/pr/16399126/savaWro-inventari/fulis-yuTi
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
X-Requested-With: XMLHttpRequest
Content-Length: 13
Origin: https://www.mymarket.ge
Connection: keep-alive
Cookie: Lang=en; split_test_version=v1; CookieID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJEYXRhIjp7IklEIjozOTUwMDY2MzUsImN0IjoxNTkyMzA2NDMxfSwiVG9rZW5JRCI6Ik55empxVStDa21QT1hKaU9lWE56emRzNHNSNWtcL1wvaVVUYjh2dExCT3ZKWT0iLCJJc3N1ZWRBdCI6MTU5MjMyMTc1MiwiRXhwaXJlc0F0IjoxNTkyMzIyMDUyfQ.mYR-I_51WLQbzWi-EH35s30soqoSDNIoOyXgGQ4Eu84; ka=da; SHOW_BETA_POPUP=B; APP_VERSION=B; LastSearch=%7B%22CatID%22%3A%22515%22%7D; PHPSESSID=eihhfcv85liiu3kt55nr9fhu5b; PopUpLog=%7B%22%2A%22%3A%222020-05-07+15%3A13%3A29%22%7D
and this is the body:
PrID=16399126
I successfully managed to replicate the post request on reqbin.com, but can't figure out how to do it with Scrapy. This is what my code looks like:
class MymarketcrawlerSpider(CrawlSpider):
    name = "mymarketcrawler"
    allowed_domains = ["mymarket.ge"]
    start_urls = ["http://mymarket.ge/"]

    rules = (
        Rule(
            LinkExtractor(allow=r".*mymarket.ge/ka/*", restrict_css=".product-card"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        item_loader = ItemLoader(item=MymarketItem(), response=response)

        def parse_num(response):
            try:
                response_text = response.text
                response_dict = ast.literal_eval(response_text)
                number = response_dict['Data']['Data']['numberToShow']
                nonlocal item_loader
                item_loader.add_value("number", number)
                yield item_loader.load_item()
            except Exception as e:
                raise CloseSpider(e)

        yield FormRequest.from_response(
            response,
            url=r"https://www.mymarket.ge/ka/pr/ShowFullNumber/",
            headers={
                "Host": "www.mymarket.ge",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "*/*",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": "https://www.mymarket.ge/ka/pr/16399126/savaWro-inventari/fulis-yuTi",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
            },
            formdata={"PrID": "16399126"},
            method="POST",
            dont_filter=True,
            callback=parse_num
        )

        item_loader.add_xpath(
            "seller", "//div[@class='d-flex user-profile']/div/span/text()"
        )
        item_loader.add_xpath(
            "product",
            "//div[contains(@class, 'container product')]//h1[contains(@class, 'product-title')]/text()",
        )
        item_loader.add_xpath(
            "price",
            "//div[contains(@class, 'container product')]//span[contains(@class, 'product-price')][1]/text()",
            TakeFirst(),
        )
        item_loader.add_xpath(
            "images",
            "//div[@class='position-sticky']/ul[@id='imageGallery']/li/@data-src",
        )
        item_loader.add_xpath(
            "condition", "//div[contains(@class, 'condition-label')]/text()"
        )
        item_loader.add_xpath(
            "city",
            "//div[@class='d-flex font-14 font-weight-medium location-views']/span[contains(@class, 'location')]/text()",
        )
        item_loader.add_xpath(
            "number_of_views",
            "//div[@class='d-flex font-14 font-weight-medium location-views']/span[contains(@class, 'svg-18')]/span/text()",
        )
        item_loader.add_xpath(
            "publish_date",
            "//div[@class='d-flex left-side']//div[contains(@class, 'font-12')]/span[2]/text()",
        )
        item_loader.add_xpath(
            "total_products_amount",
            "//div[contains(@class, 'user-profile')]/div/a/text()",
            re=r"\d+",
        )
        item_loader.add_xpath(
            "description", "//div[contains(@class, 'texts full')]/p/text()"
        )
        item_loader.add_value("url", response.url)

        yield item_loader.load_item()
The code above doesn't work; the number field is not populated.
I can print the number to the screen, but I'm unable to save it to the CSV file. The number column in the CSV file is blank; it doesn't contain any values.
Scrapy works asynchronously: every link to crawl, every item to process, etc. is put inside a queue. That is why you yield a request and wait for the downloader, the item pipelines, etc. to do their work.
What is happening is that your two requests are processed separately, and that is why you don't see your results. Personally, I would parse the results from the first request, save them in the 'meta' data and pass them on to the next request, so that the data is available afterwards.
E.g.
class MymarketcrawlerSpider(CrawlSpider):
    name = "mymarketcrawler"
    allowed_domains = ["mymarket.ge"]
    start_urls = ["http://mymarket.ge/"]

    rules = (
        Rule(
            LinkExtractor(allow=r".*mymarket.ge/ka/*", restrict_css=".product-card"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        def parse_num(response):
            item_loader = ItemLoader(item=MymarketItem(), response=response)
            try:
                response_text = response.text
                response_dict = ast.literal_eval(response_text)
                number = response_dict['Data']['Data']['numberToShow']
                # New part:
                product = response.meta['product']
                # You won't need this now: nonlocal item_loader
                # Also new:
                item_loader.add_value("number", number)
                item_loader.add_value("product", product)
                yield item_loader.load_item()
            except Exception as e:
                raise CloseSpider(e)

        # Rewrite your parsers like this:
        product = response.xpath(
            "//div[contains(@class, 'container product')]//h1[contains(@class, 'product-title')]/text()"
        ).get()

        yield FormRequest.from_response(
            response,
            url=r"https://www.mymarket.ge/ka/pr/ShowFullNumber/",
            headers={
                "Host": "www.mymarket.ge",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "*/*",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": "https://www.mymarket.ge/ka/pr/16399126/savaWro-inventari/fulis-yuTi",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
            },
            formdata={"PrID": "16399126"},
            method="POST",
            dont_filter=True,
            callback=parse_num,
            meta={"product": product}
        )
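A side note that is not part of the original answer: on Scrapy 1.7+ the same hand-off can be done with cb_kwargs instead of meta, which delivers the value straight into the callback's signature. A small, self-contained sketch (the spider name and structure here are hypothetical; only the request/callback pattern matters):

import scrapy
from scrapy import FormRequest

class NumberSketchSpider(scrapy.Spider):
    # Hypothetical minimal spider, only to illustrate cb_kwargs.
    name = "number_sketch"
    start_urls = ["https://www.mymarket.ge/ka/pr/16399126/savaWro-inventari/fulis-yuTi"]

    def parse(self, response):
        product = response.xpath(
            "//div[contains(@class, 'container product')]//h1[contains(@class, 'product-title')]/text()"
        ).get()
        # FormRequest with formdata defaults to POST; cb_kwargs passes 'product'
        # directly into parse_num as a keyword argument.
        yield FormRequest(
            url="https://www.mymarket.ge/ka/pr/ShowFullNumber/",
            formdata={"PrID": "16399126"},
            callback=self.parse_num,
            cb_kwargs={"product": product},
        )

    def parse_num(self, response, product):
        yield {"product": product, "raw_number_response": response.text}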
