Web scraping: making a POST request with cookies - web-scraping

I would like to retrieve the timetable for this [booking site][1]. I need to retrieve/refresh the cookie before making the POST request to their timetable JSON endpoint, otherwise I get a session ID error:
`sessionId: None` together with 'errorCode': '620', 'errorDescription': 'Invalid Session Number'
This is the request I make:
import requests

url = 'https://alilauro-tickets.certusonline.com/php/proxy.php'

s = requests.session()

# Request the timetable website
s.get('https://alilauro-tickets.certusonline.com/')
s.headers.update({'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.3'})

data = {
    'msg': 'TimeTable',
    'req': '{"getAvailability":"Y","getBasicPrice":"Y","getRouteAnalysis":"Y","directOnly":"Y","legs":1,"pax":1,"origin":"BEV","destination":"FOR","tripRequest":[{"tripfrom":"BEV","tripto":"FOR","tripdate":"2020-03-18","tripleg":0}]}'
}

# Request the JSON with timetable data
r = s.post(url, data=data, timeout=20, cookies=s.cookies)
This is the full response I get:
{'SWS_LoginInfo': {'agencyCode': None, 'userCode': None, 'password': None, 'language': 'EN', 'sessionId': None, 'payByCreditCard': None, 'hasCreditLimit': None, 'creditLimit': None, 'ticketCount': None, 'errorCode': '620', 'errorDescription': 'Invalid Session Number'}, 'SWS_TripInfo': {'getAvailability': None, 'getPrices': None, 'getRouteAnalysis': None, 'getRequiredFields': None}, 'VWS_Trips_Trip': [], 'SWS_Parameters': {'VCompanies_companyDetails': [], 'VPorts_portDetails': [], 'VCountries_countryDetails': [], 'VVessels_vesselDetails': [], 'VPassengerTypes_passengerType': [], 'VPassengerClasses_passengerClass': [], 'VPassengerDiscounts_passengerDiscount': [], 'VVehicleTypes_vehicleType': [], 'VVehicleDiscounts_vehicleDiscount': [], 'VServiceTypes_serviceType': [], 'VServiceDiscounts_serviceDiscount': [], 'VPortCombinations_portCombination': [], 'VDeliveryMethods_deliveryMethod': [], 'VDocumentTypes_documentType': [], 'VLoyaltyCardTypes_loyaltyCardType': []}, 'SWS_PriceTotals': {'totalNetFare': None, 'totalTaxes': None, 'totalVat': None, 'totalPrice': None, 'totalFees': None, 'totalFeesTax': None, 'totalPayable': None}, 'SWS_Reservation': {'salesChannel': None, 'bookingReference': None, 'companyReference': None, 'acceptFees': None, 'reservationStatus': None, 'optionDateTime': None, 'issuePrepaid': None, 'leaderFullName': None, 'leaderEmail': None, 'leaderPhone': None, 'totalNetFare': None, 'totalTaxes': None, 'totalVat': None, 'totalPrice': None, 'totalFees': None, 'totalFeesTax': None, 'totalPayable': None, 'refundAmount': None, 'acceptTerms': None, 'deliveryMethod': None, 'deliveryAmount': None, 'deliveryAddress': None, 'deliveryCountry': None, 'zipCode': None, 'acceptShareData': None, 'settled': None}, 'VWS_CancelledTickets_Ticket': [], 'VWS_Tickets_Ticket': [], 'ScardMember': {'id': None, 'surname': None, 'firstname': None, 'languageId': None, 'language': None, 'gender': None, 'documentTypeId': None, 'documentTypeCode': None, 'documentType': None, 'documentNumber': None, 'nationalityId': None, 'nationality': None, 'dateRegistered': None, 'active': None, 'mobile': None, 'phone': None, 'fax': None, 'email': None, 'address': None, 'zipCode': None, 'countryId': None, 'country': None, 'birthDate': None, 'birthPlace': None, 'VCards_loyaltyCard': []}, 'SloyaltyCard': {'id': None, 'loyaltyCardTypeId': None, 'loyaltyCardTypeCode': None, 'loyaltyCardType': None, 'cardNumber': None, 'active': None, 'points': None, 'dateFrom': None, 'dateTo': None}, 'VcardTransactions_cardTransaction': []}

You simply need to access/store the cookies of the current session by calling s.cookies.
Then you can reuse them in subsequent requests.
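For reference, here is a minimal sketch of that flow with requests, assuming the backend only needs a browser-like User-Agent plus the session cookie it sets on the landing page. The endpoint, payload and response keys are taken from the question; nothing here has been re-verified against the live site.

import requests

s = requests.Session()
# Set the header before any request so even the landing page sees a browser-like UA.
s.headers.update({
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.3',
})

# Hit the landing page first so the server can set its session cookie;
# a requests.Session stores and resends cookies automatically.
s.get('https://alilauro-tickets.certusonline.com/', timeout=20)

data = {
    'msg': 'TimeTable',
    'req': ('{"getAvailability":"Y","getBasicPrice":"Y","getRouteAnalysis":"Y",'
            '"directOnly":"Y","legs":1,"pax":1,"origin":"BEV","destination":"FOR",'
            '"tripRequest":[{"tripfrom":"BEV","tripto":"FOR",'
            '"tripdate":"2020-03-18","tripleg":0}]}'),
}

# cookies=s.cookies is redundant here: the session already attaches its cookie jar.
r = s.post('https://alilauro-tickets.certusonline.com/php/proxy.php',
           data=data, timeout=20)
print(r.status_code)
print(r.json().get('SWS_LoginInfo'))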

Related

Scrapy parsed unknown character

I want to scrape the site https://www.bikebd.com/brand/yamaha/. Here is my script:
import scrapy
from scrapy.utils.response import open_in_browser
from urllib.parse import urlencode

class BikebdSpider(scrapy.Spider):
    name = 'bikebd'
    allowed_domains = ['www.bikebd.com']
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,da;q=0.8',
        'cache-control': 'no-cache',
        'cookie': '_ga=GA1.1.1549289426.1669609851; XSRF-TOKEN=eyJpdiI6IjhCN1BnV0RoK3dOQnFEQlhYRUZVZEE9PSIsInZhbHVlIjoiTFQ4Ym15MWhoU1hmR3FxaWdVYnkvbnovMTVDbS9iRm1OVCsrV0F2RzA5dHlmMWpObENoSFBWY0VXclBNWkZaNlV1aitwSXBWNjhNMGs2Z3JqQ3ZvQWVIQ25QcnNOZkNpR3lwMGNkL01aWHM3VDZ5YmZJblRha0kyUk5IMTh2UzQiLCJtYWMiOiJjMzFmMDZlZDFjNzVhNTVlZjY1MWEzNWJkZjY5Y2Q1MjFiZmNmM2UxOWRiZWJlMGRhZWY5OGU0MGQ4OWI5N2ViIiwidGFnIjoiIn0%3D; bikebd_session=eyJpdiI6ImVVb2NqcmFLR2dKSXc2NnNqUlV6ZWc9PSIsInZhbHVlIjoibUVNcEZidUxsbWdkK3c2UDFYdDYwcHFOdVU1WmVXY0ZiV1pHRzJBbzlaUDNuWGl2Vk1OTk5QYnRkdmVXdDg3bEx2SEpiMGE1c2dvakdkU0tQOTBucHc5ajRpcGpod2ViL3B2ME9DRXc4SUFtSG56YU9MVTdEVi9rYW8reXk0TDYiLCJtYWMiOiI5MmU2NWEyZDhkOGFiNTdkYzQ0ZGJhMDQwNzFhYzFmOGY4MzNjNWU2ODczYWNiOTVlNjU4MWUyZWVmMzE5NjNmIiwidGFnIjoiIn0%3D; _ga_HEG073JLWK=GS1.1.1670663205.2.1.1670663540.0.0.0',
        'pragma': 'no-cache',
        'referer': 'https://www.bikebd.com/bike-price-in-bd',
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': "Windows",
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        urls = ["https://www.bikebd.com/brand/yamaha"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        container = response.xpath("//div[@class='tab-cards-rev']/div/div[@class='col-md-3']")
        for item in container:
            title = item.xpath(".//h2[@class='ltn-tittle mb-0']/strong/text()").get()
            yield {'title': title}
But when I run the spider it returns None. So I started debugging the spider with open_in_browser, using this line of code...
def parse(self, response):
    open_in_browser(response)
Then it showed me some unreadable characters like below.
ôÿ‘šôC##tøœ÷ÿfjun_N?ÈKÛžDÜ$YVy˲UÒ–³TUoŸÄ'…8hI®ö,ëá4·9g¶åÝûtÎOûUéCh|⢀Ð8`÷ D“†b&³“ݪW¯ª~÷À"Á¹¹a]Ðøß¿{©wŽ(€è ¼ÇX¶nû»¥ˆŠ¦eÙËÿ«Íñ"åY³1Vÿõ¯³½ÍUDñ‡½â`¹‰½é½ê”§Œl‡%,{Š»È?8PaÐ-œ[·EÏ&Žl,ö‰êµŽÄ€ŠŒ+ŒMØèãG{L˜ž2 ?£?èa´UWÞ$[0²üÃZ’‡N±ÅÔ%$[pÝ9ä[ ¯±ÖÞW(ñ¥-ˆxf¿ì±
What's going on with the site? I need some help.
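No answer is recorded here, but two things stand out in the spider as posted, and neither is confirmed as the actual cause: the headers dict is declared on the class yet never passed to scrapy.Request, and it advertises accept-encoding: gzip, deflate, br, while Scrapy can only decode brotli (br) responses when a brotli package is installed; an undecoded compressed body would render exactly like the bytes above. A hedged sketch that passes the headers explicitly and stops advertising br (header values trimmed to the essentials, XPaths taken from the question):

import scrapy

class BikebdSpider(scrapy.Spider):
    name = 'bikebd'
    allowed_domains = ['www.bikebd.com']

    # Trimmed header set; only advertise encodings Scrapy decodes out of the box.
    # Dropping 'br' is an assumption about why the body rendered as binary noise.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.9',
        'referer': 'https://www.bikebd.com/bike-price-in-bd',
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'),
    }

    def start_requests(self):
        # Defining headers on the class does nothing by itself;
        # they have to be attached to each Request.
        for url in ["https://www.bikebd.com/brand/yamaha"]:
            yield scrapy.Request(url=url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        container = response.xpath("//div[@class='tab-cards-rev']/div/div[@class='col-md-3']")
        for item in container:
            yield {'title': item.xpath(".//h2[@class='ltn-tittle mb-0']/strong/text()").get()}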

Scrapy via API requests: [Product Catalogue page > Product Page] > Pagination

I am trying to scrape product details from the product page using API requests. I have no issues accessing the product catalogue page and getting the request URLs for each of the products, but I am having trouble passing them correctly from one function to another.
I think I am missing a few lines of code, or am using self.parse incorrectly. If I send a new request (for each product page), should I send new request headers as well? The product page has different request headers than the product catalogue page. How do I do that?
Thank you so much for your feedback and help! Much appreciated.
This is my work so far: https://pastebin.com/H1yyDiDL
import scrapy
from scrapy.exceptions import CloseSpider
import json

class HtmshopeeSpider(scrapy.Spider):
    name = 'shopeeitem2'
    headers = {
        'authority': 'shopee.com.my',
        'method': 'GET',
        'path': '/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'private_content_version=75d921dc5d1fc85c97d8d9876d6e58b2; _fbp=fb.2.1626162049790.1893904607; _ga=GA1.3.518387377.1626162051; _gid=GA1.3.151467354.1626162051; _gcl_au=1.1.203553443.1626162051; x_axis_main=v_id:017a9ecfb7ba000a4be21b24a20803079001c0710093c$_sn:1$_ss:1$_pn:1%3Bexp-session$_st:1626163851002$ses_id:1626162051002%3Bexp-session',
        'if-none-match-': '55b03-676eb00af72df9e2b38a2976dd41d5ea',
        'pragma': 'no-cache',
        'referer': 'https://shopee.com.my/search?keyword=chantiva&page=0',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-api-source': 'pc',
        'x-requested-with': 'XMLHttpRequest',
        'x-shopee-language': 'en'
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2',
            headers=self.headers,
            callback=self.parse_products,
            meta={
                'newest': 0
            }
        )

    def parse_products(self, response):
        json_resp = json.loads(response.body)
        products = json_resp.get('items')
        for product in products:
            item_id = product.get('item_basic').get('itemid'),
            shop_id = product.get('item_basic').get('shopid')
            yield scrapy.Request(
                url=f"https://shopee.com.my/api/v2/item/get?itemid={item_id}&shopid={shop_id}",
                callback=self.parse_data,
                headers=self.headers
            )

    def parse_data(self, response):
        json_resp = json.loads(response.body)
        datas = json_resp.get('item')
        for data in datas:
            yield {
                'product': data.get('name')
            }
        count = 240000
        next_page = response.meta['newest'] + 60
        if next_page <= count:
            yield scrapy.Request(
                url=f"https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest={next_page}&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2",
                headers=self.headers,
                meta={'newest': next_page}
            )
Here is the solution. Actually, the response contains the total count (123 here) and the URL requests 60 items per page, so the next-page offset can be computed from those instead of a hard-coded limit.
CODE:
import scrapy
from scrapy.exceptions import CloseSpider
import json

class HtmshopeeSpider(scrapy.Spider):
    name = 'shopeeitem2'
    headers = {
        'authority': 'shopee.com.my',
        'method': 'GET',
        'path': '/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'private_content_version=75d921dc5d1fc85c97d8d9876d6e58b2; _fbp=fb.2.1626162049790.1893904607; _ga=GA1.3.518387377.1626162051; _gid=GA1.3.151467354.1626162051; _gcl_au=1.1.203553443.1626162051; x_axis_main=v_id:017a9ecfb7ba000a4be21b24a20803079001c0710093c$_sn:1$_ss:1$_pn:1%3Bexp-session$_st:1626163851002$ses_id:1626162051002%3Bexp-session',
        'if-none-match-': '55b03-676eb00af72df9e2b38a2976dd41d5ea',
        'pragma': 'no-cache',
        'referer': 'https://shopee.com.my/search?keyword=chantiva&page=0',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-api-source': 'pc',
        'x-requested-with': 'XMLHttpRequest',
        'x-shopee-language': 'en'
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2',
            headers=self.headers,
            callback=self.parse_products,
            meta={
                'newest': 0
            }
        )

    def parse_products(self, response):
        json_resp = json.loads(response.body)
        products = json_resp.get('items')
        for product in products:
            yield {
                'Name': product.get('item_basic').get('name'),
                'Price': product.get('item_basic').get('price')
            }
        count = json_resp.get('total_count')
        next_page = response.meta['newest'] + 60
        if next_page <= count:
            yield scrapy.Request(
                url=f'https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest={next_page}&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2',
                callback=self.parse_products,
                headers=self.headers,
                meta={'newest': next_page}
            )
OUTPUT (a portion of the full output):
{'Name': 'Chantiva Haruan Tablet SS Plus 450mg (60 Tabs) Cepat sembuh luka', 'Price': 9000000}
2021-08-10 12:40:30 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'CHANTIVA 750MG 30 TABLETS (EXP:04/23)', 'Price': 8490000}
2021-08-10 12:40:30 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'CHANTIVA TABLET HARUAN SS PLUS 450MG (EXP: 03/2022)', 'Price': 1389000}
{'Name': 'CHANTIVA HARUAN SS PLUS TAB 60S', 'Price': 7550000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': "CHANTIVA 450MG 1 STRIP 10'S (IKAN HARUAN)", 'Price': 2000000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'CHANTIVA TABLET 750MG (EXP 04/23)', 'Price': 3800000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': "TrueLifeSciences® CHANTIVA Haruan SS Plus 450mg Tablet 60's", 'Price': 8460000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'Chantiva 450mg Tablet', 'Price': 9400000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'Chantiva Tablet Haruan SS Plus 450mg 60s', 'Price': 8565000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'Chantiva Skin Fix Cream 20g x2 (Twin Pack)', 'Price': 5380000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': "CHANTIVA 450MG TABLET 60'S", 'Price': 7690000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'CHANTIVA TABLET HARUAN (450MG/750MG)', 'Price': 2000000}
2021-08-10 12:40:31 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=60&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>
{'Name': 'CHANTIVA 750MG 30 TABLETS (EXP: 09/2022)', 'Price': 8490000}
2021-08-10 12:40:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=120&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2> (referer: https://shopee.com.my/search?keyword=chantiva&page=0)
2021-08-10 12:40:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=120&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>{'Name': 'CHANTIVA TABLET IKAN HARUAN 450MG SAKIT LUTUT SAKIT URAT LUKA 60"S', 'Price': 7490000}
2021-08-10 12:40:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=120&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>{'Name': "[CLEARANCE][🎁WITH FREE GIFT🎁] CHANTIVA TABLET HARUAN SS PLUS 60'S (EXP:02/2021)", 'Price': 7600000}
2021-08-10 12:40:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://shopee.com.my/api/v4/search/search_items?by=relevancy&keyword=chantiva&limit=60&newest=120&order=desc&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2>{'Name': "CHANTIVA 450MG TABLET 6X10'S by strip Exp:10/21", 'Price': 990000}
2021-08-10 12:40:32 [scrapy.core.engine] INFO: Closing spider (finished)
2021-08-10 12:40:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3242,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 40725,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'elapsed_time_seconds': 4.219452,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 8, 10, 6, 40, 32, 976939),
'httpcompression/response_bytes': 377162,
'httpcompression/response_count': 3,
'item_scraped_count': 123,
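On the part of the question the answer above leaves open, whether the product-page request needs its own headers: Scrapy sets headers per Request, so the product call can simply carry a different dict than the catalogue call. Below is a hedged, self-contained sketch of that pattern; the item_headers values are placeholders to be copied from the browser's network tab for /api/v2/item/get, and the endpoints are the ones from the question, which may have changed since.

import json
import scrapy

class ShopeeItemDetailSpider(scrapy.Spider):
    """Hypothetical sketch: one header set per endpoint, chained requests."""
    name = 'shopee_item_detail_sketch'

    search_headers = {
        'user-agent': 'Mozilla/5.0',
        'x-api-source': 'pc',
        'x-requested-with': 'XMLHttpRequest',
    }
    # Assumed headers for the item endpoint; copy the real ones from the
    # browser's network tab for the /api/v2/item/get call.
    item_headers = {
        'user-agent': 'Mozilla/5.0',
        'x-api-source': 'pc',
        'referer': 'https://shopee.com.my/',
    }

    def start_requests(self):
        url = ('https://shopee.com.my/api/v4/search/search_items?'
               'by=relevancy&keyword=chantiva&limit=60&newest=0&order=desc'
               '&page_type=search&scenario=PAGE_GLOBAL_SEARCH&version=2')
        yield scrapy.Request(url, headers=self.search_headers, callback=self.parse_products)

    def parse_products(self, response):
        data = json.loads(response.body)
        for product in data.get('items') or []:
            basic = product.get('item_basic') or {}
            item_url = (f"https://shopee.com.my/api/v2/item/get?"
                        f"itemid={basic.get('itemid')}&shopid={basic.get('shopid')}")
            # Each Request carries its own headers, so the product-page call
            # can use a different set than the catalogue call.
            yield scrapy.Request(item_url, headers=self.item_headers, callback=self.parse_item)

    def parse_item(self, response):
        item = json.loads(response.body).get('item') or {}
        yield {'product': item.get('name'), 'price': item.get('price')}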

Scrapy runs but doesn't crawl site - Scrapy Shell response in loop

I've tried to set up Scrapy to crawl a database of technical norms and standards.
What is the problem:
I wrote a scraper and got a 200 response, but no results; it scraped 0 items:
2020-09-06 12:42:00 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: stack)
2020-09-06 12:42:00 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 22:20:52) [MSC v.1916 32 bit (Intel)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0j 20 Nov 2018), cryptography 2.4.2, Platform Windows-10-10.0.18362-SP0
2020-09-06 12:42:00 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'stack', 'NEWSPIDER_MODULE': 'stack.spiders', 'SPIDER_MODULES': ['stack.spiders'], 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
2020-09-06 12:42:00 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-06 12:42:01 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-06 12:42:01 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-06 12:42:01 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-09-06 12:42:01 [scrapy.core.engine] INFO: Spider opened
2020-09-06 12:42:01 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-06 12:42:01 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2020-09-06 12:42:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.beuth.de/de/regelwerke/vdi/vdi-richtlinien-entwuerfe> (referer: None)
2020-09-06 12:42:01 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-06 12:42:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 341,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 6149,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 6, 10, 42, 1, 684021),
'log_count/DEBUG': 2,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 6, 10, 42, 1, 140686)}
2020-09-06 12:42:01 [scrapy.core.engine] INFO: Spider closed (finished)
Here are the items from my items.py file that I want to scrape:
from scrapy.item import Item, Field

class StackItem(Item):
    title = Field()
    url = Field()
    date = Field()
    price = Field()
    subtitle = Field()
    description = Field()
This is my crawler code:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import StackItem

class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["www.beuth.de"]
    start_urls = [
        "https://www.beuth.de/de/regelwerke/vdi/vdi-richtlinien-entwuerfe",
    ]

    def parse(self, response):
        elements = Selector(response).xpath('//div[@class="bwr-card__inner"]')
        for element in elements:
            item = StackItem()
            item['title'] = element.xpath('a[@class="bwr-link__label"]/text()').extract()[0]
            item['url'] = element.xpath('a[@class="bwr-card__title-link"]/@href').extract()[0]
            item['date'] = element.xpath('div[@class="bwr-type__item bwr-type__item--light"]/text()').extract()[0]
            item['price'] = element.xpath('div[@class="bwr-buybox__price-emph"]/text()').extract()[0]
            item['subtitle'] = element.xpath('div[@class="bwr-card__subtitle bwr-data-dlink"]/text()').extract()[0]
            item['description'] = element.xpath('div[@class="bwr-card__text bwr-rte bwr-data-dlink"]/text()').extract()[0]
            yield item
What I tried to solve the problem:
I tried the Scrapy shell. After I set a defined User-Agent, the website didn't block me:
In [2]: from scrapy import Request
   ...: req = Request(
   ...:     "https://www.beuth.de/de/regelwerke/vdi/vdi-richtlinien-entwuerfe",
   ...:     headers={"USER-AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888"})
   ...: fetch(req)
2020-09-06 12:48:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.beuth.de/de/regelwerke/vdi/vdi-richtlinien-entwuerfe> (referer: None)
After that I tested the selectors (see the items list above), but unfortunately I just get [] whatever XPath I use:
In [3]: response.xpath("//div[@class='bwr-buybox__price']/a/text()").getall()
Out[3]: []
When I try view(response), I just get a browser window with an infinite loading loop.
I tried to analyse the outcome, but without an error I wasn't sure where to start fixing.
I defined a User-Agent in settings.py because otherwise I got an error message about being blocked (I think the website doesn't allow crawls).
Summary:
I want to scrape the items above from the list, but I am not sure whether the problem is just my selectors, because testing in the shell gives me [] every time.
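There is no accepted answer recorded here, but one hedged way to narrow it down: the crawl stats above show only 6149 response bytes for the listing page, which is small for a catalogue page, so before touching the selectors it is worth saving the body the spider actually received and checking whether the bwr-card markup appears in it at all. If it does not, the listing is most likely rendered client-side by JavaScript or hidden behind a bot check, and no XPath will match the plain HTML. A small diagnostic sketch (spider name and output file are made up for illustration):

import scrapy

class BeuthDebugSpider(scrapy.Spider):
    """Hypothetical debugging spider: save the raw body and report whether the
    expected listing markup is present in the plain HTML response."""
    name = "beuth_debug"
    start_urls = ["https://www.beuth.de/de/regelwerke/vdi/vdi-richtlinien-entwuerfe"]
    custom_settings = {
        "USER_AGENT": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"),
    }

    def parse(self, response):
        # Persist the body so it can be inspected in a text editor.
        with open("beuth_response.html", "wb") as f:
            f.write(response.body)
        cards = response.xpath('//div[@class="bwr-card__inner"]')
        self.logger.info("body length: %d bytes, bwr-card__inner matches: %d",
                         len(response.body), len(cards))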

Can't upload pdf to Slack ("no_file_data") with Elixir and Hackney without using :file

I'm writing a Slack bot in Elixir that has to upload a file to a channel.
I have figured out how to do it using the :file option of :hackney.send_multipart_body, but I haven't been able to send data directly using either the :data or :part options; I keep getting %{"error" => "no_file_data", "ok" => false}.
I don't know much about multipart/form-data and I'm not sure whether the problem comes from the data being sent or from the way it's being sent.
I used socat to dump my requests.
Solution with :file
{:ok, ref} =
  :hackney.request(
    :post,
    # "http://localhost:6143?" <>
    "https://slack.com/api/files.upload?" <>
      Plug.Conn.Query.encode(%{
        "channels" => "AAAAAA",
        "filename" => "mix.exs",
        "filetype" => "elixir"
      }),
    [
      {"Authorization", "Bearer #{@token}"}
    ],
    :stream_multipart,
    []
  )

:hackney.send_multipart_body(
  ref,
  {:file, "./mix.exs"}
)

{:ok, _status, _headers, ref} = :hackney.start_response(ref)
{:ok, body} = :hackney.body(ref)
body |> Poison.decode!()
POST ?channels=AAAAAAAA&filename=mix.exs&filetype=elixir HTTP/1.1
Authorization: Bearer xoxb-XXXXXX
Host: localhost:6143
User-Agent: hackney/1.16.0
Content-Type: multipart/form-data; boundary=---------------------------tmktxwwsruoirfey
Transfer-Encoding: chunked
af
-----------------------------tmktxwwsruoirfey
content-length: 646
content-type: application/octet-stream
content-disposition: form-data; name="file"; filename="mix.exs"
286
defmodule TestAppSlackCowboy.MixProject do
  use Mix.Project

  def project do
    [
      app: :test_app_slack_cowboy,
      version: "0.1.0",
      elixir: "~> 1.10",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # Run "mix help compile.app" to learn about applications.
  def application do
    [
      extra_applications: [:logger, :plug_cowboy],
      mod: {TestAppSlackCowboy.Application, []}
    ]
  end

  # Run "mix help deps" to learn about dependencies.
  defp deps do
    [
      {:plug, "~> 1.10"},
      {:plug_cowboy, "~> 2.0"},
      {:poison, "~> 3.1"},
      {:hackney, "~> 1.15"}
    ]
  end
end
2
31
-----------------------------tmktxwwsruoirfey--
0
Solution with :data
{:ok, content} = File.read("mix.exs")

{:ok, ref} =
  :hackney.request(
    :post,
    # "http://localhost:6143?" <>
    "https://slack.com/api/files.upload?" <>
      Plug.Conn.Query.encode(%{
        "channels" => "AAAAAA",
        "filename" => "mix.exs",
        "filetype" => "elixir"
      }),
    [
      {"Authorization", "Bearer #{@token}"}
    ],
    :stream_multipart,
    []
  )

:hackney.send_multipart_body(
  ref,
  {:data, "file", content}
)

{:ok, _status, _headers, ref} = :hackney.start_response(ref)
{:ok, body} = :hackney.body(ref)
body |> Poison.decode!()
The delimitation of parts is different here.
POST ?channels=AAAAA&filename=mix.exs&filetype=elixir HTTP/1.1
Authorization: Bearer xoxb-XXXXXX
Host: localhost:6143
User-Agent: hackney/1.16.0
Content-Type: multipart/form-data; boundary=---------------------------liugtwbcxxsgwiyg
Transfer-Encoding: chunked
321
-----------------------------liugtwbcxxsgwiyg
content-length: 4
content-type: application/octet-stream
content-disposition: form-data; name="file"
defmodule TestAppSlackCowboy.MixProject do
  use Mix.Project

  def project do
    [
      app: :test_app_slack_cowboy,
      version: "0.1.0",
      elixir: "~> 1.10",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # Run "mix help compile.app" to learn about applications.
  def application do
    [
      extra_applications: [:logger, :plug_cowboy],
      mod: {TestAppSlackCowboy.Application, []}
    ]
  end

  # Run "mix help deps" to learn about dependencies.
  defp deps do
    [
      {:plug, "~> 1.10"},
      {:plug_cowboy, "~> 2.0"},
      {:poison, "~> 3.1"},
      {:hackney, "~> 1.15"}
    ]
  end
end
31
-----------------------------liugtwbcxxsgwiyg--
0
Solution with :part
{:ok, content} = File.read("mix.exs")

{:ok, ref} =
  :hackney.request(
    :post,
    # "http://localhost:6143?" <>
    "https://slack.com/api/files.upload?" <>
      Plug.Conn.Query.encode(%{
        "channels" => "AAAAAA",
        "filename" => "mix.exs",
        "filetype" => "elixir"
      }),
    [
      {"Authorization", "Bearer #{@token}"}
    ],
    :stream_multipart,
    []
  )

:hackney.send_multipart_body(
  ref,
  {:part, "file", String.length(content)}
)

:hackney.send_multipart_body(
  ref,
  {:part_bin, content}
)

:hackney.send_multipart_body(
  ref,
  {:part, :eof}
)

{:ok, _status, _headers, ref} = :hackney.start_response(ref)
{:ok, body} = :hackney.body(ref)
body |> Poison.decode!()
The only difference with :file here is the absence of filename="mix.exs" in the part header.
POST ?channels=AAAAAA&filename=mix.exs&filetype=elixir HTTP/1.1
Authorization: Bearer xoxb-XXXXXX
Host: localhost:6143
User-Agent: hackney/1.16.0
Content-Type: multipart/form-data; boundary=---------------------------rqimofvzwjzelkol
Transfer-Encoding: chunked
9b
-----------------------------rqimofvzwjzelkol
content-length: 646
content-type: application/octet-stream
content-disposition: form-data; name="file"
286
defmodule TestAppSlackCowboy.MixProject do
  use Mix.Project

  def project do
    [
      app: :test_app_slack_cowboy,
      version: "0.1.0",
      elixir: "~> 1.10",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # Run "mix help compile.app" to learn about applications.
  def application do
    [
      extra_applications: [:logger, :plug_cowboy],
      mod: {TestAppSlackCowboy.Application, []}
    ]
  end

  # Run "mix help deps" to learn about dependencies.
  defp deps do
    [
      {:plug, "~> 1.10"},
      {:plug_cowboy, "~> 2.0"},
      {:poison, "~> 3.1"},
      {:hackney, "~> 1.15"}
    ]
  end
end
2
31
-----------------------------rqimofvzwjzelkol--
0
Versions:
Elixir: 1.10.2
OTP: 22.0.7
Hackney: 1.15
Plug: 1.10

Error while rendering collective.carousel portlet

I've installed collective.carousel 1.5 (note: I was using 1.6 and got the same error; I was hoping it was a regression bug or something) on Plone 4.3.1.
I then created a carousel portlet associated with a collection on the site.
When I go to pages that contain the portlet I get the following error:
KeyError: 'carousel-portlet-view'
Here's the traceback:
- URL: /usr/local/share/plone/buildout-cache/eggs/collective.carousel-1.5-py2.7.egg/collective/carousel/portlets/carousel.pt
- Line 44, Column 28
- Expression: <PythonExpr (view.get_tile(item_object))>
- Names:
{'args': (),
'container': <Collection at /mysite/aggregator>,
'context': <Collection at /mysite/aggregator>,
'default': <object object at 0x7fb332672b40>,
'here': <Collection at /mysite/aggregator>,
'loop': {},
'nothing': None,
'options': {},
'repeat': <Products.PageTemplates.Expressions.SafeMapping object at 0x86704c8>,
'request': <HTTPRequest, URL=http://mysite.com/aggregator/folder_summary_view>,
'root': <Application at >,
'template': <Products.Five.browser.pagetemplatefile.ViewPageTemplateFile object at 0x69f4610>,
'traverse_subpath': [],
'user': <PropertiedUser 'admin'>,
'view': <collective.carousel.portlets.carousel.Renderer object at 0x8671d50>,
'views': <Products.Five.browser.pagetemplatefile.ViewMapper object at 0x86742d0>}
Module zope.tales.pythonexpr, line 59, in __call__
- __traceback_info__: (view.get_tile(item_object))
Module <string>, line 1, in <module>
Module collective.carousel.portlets.carousel, line 177, in get_tile
Module OFS.Traversable, line 300, in unrestrictedTraverse
- __traceback_info__: ([], 'carousel-portlet-view')
KeyError: 'carousel-portlet-view'
Looking at the views registered for the name carousel-portlet-view in the source code
https://github.com/collective/collective.carousel/blob/master/collective/carousel/browser/configure.zcml
one can see that those are all Archetypes-specific.
Given the information in your comment that you're using Dexterity, I suppose you need to register a carousel-portlet-view for your Dexterity content types.
