I used Portia to create a spider and then downloaded it as a Scrapy project. The spider runs without errors, but the log says "Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)" and nothing gets saved. However, the log also shows every page being crawled with a 200 response, and the stats at the end show the bytes of data downloaded.
Spider Code
from __future__ import absolute_import
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity
from scrapy.spiders import Rule
from ..utils.spiders import BasePortiaSpider
from ..utils.starturls import FeedGenerator, FragmentGenerator
from ..utils.processors import Item, Field, Text, Number, Price, Date, Url, Image, Regex
from ..items import PortiaItem, AllProductsBooksToScrapeSandboxItem
class BooksToscrape(BasePortiaSpider):
    """Portia-generated spider for the books.toscrape.com catalogue.

    The single start-URL entry is a Portia "generated" URL spec made of a
    fixed prefix, a numeric range and a fixed suffix, i.e.
    ``http://books.toscrape.com/catalogue/page-[1-50].html``.
    Items are extracted from every ``.product_pod`` element with two
    fields, ``title`` and ``price``.
    """

    name = "books.toscrape.com"
    allowed_domains = ['books.toscrape.com']

    # Portia start-URL description: fixed prefix + range 1-50 + fixed suffix.
    start_urls = [{
        'fragments': [
            {'valid': True,
             'type': 'fixed',
             'value': 'http://books.toscrape.com/catalogue/page-'},
            {'valid': True,
             'type': 'range',
             'value': '1-50'},
            {'valid': True,
             'type': 'fixed',
             'value': '.html'},
        ],
        'type': 'generated',
        'url': 'http://books.toscrape.com/catalogue/page-[1-50].html',
    }]

    # NOTE(review): the deny pattern '.*' matches every URL, so this rule
    # presumably never yields a link and 'parse_item' is never reached
    # through it -- only the start URLs themselves are fetched. Confirm how
    # BasePortiaSpider handles start-URL responses.
    rules = [
        Rule(
            LinkExtractor(
                allow=(),
                deny=('.*')
            ),
            callback='parse_item',
            follow=True
        )
    ]

    # One item definition: container selector '.product_pod' and the CSS
    # selectors for each field (no extra processors on either field).
    items = [
        [
            Item(
                AllProductsBooksToScrapeSandboxItem,
                None,
                '.product_pod',
                [
                    Field('title', 'h3 > a::attr(title)', []),
                    Field('price', '.product_price > .price_color *::text', []),
                ],
            )
        ]
    ]
Pipeline Code
I added open_spider and close_spider methods so the pipeline writes the items to a JSON Lines file while crawling. I think this part works, because the .jl file gets created.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class TesterPipeline(object):
    """Item pipeline that writes every scraped item to ``items.jl``.

    The output is in JSON Lines format: one JSON object per line. The file
    is opened when the spider starts and closed when it finishes.
    """

    def open_spider(self, spider):
        """Open ``items.jl`` for writing, truncating any previous content."""
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        """Close the output file opened in :meth:`open_spider`."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialise *item* as one JSON line and return it unchanged.

        Returning the item lets any later pipeline stage keep processing it.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
Settings Code
Enabled pipeline in settings too for pipeline to work.
# -*- coding: utf-8 -*-
# Scrapy settings for Tester project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project/bot name; it is reported in the start-up log line ("bot: Tester").
BOT_NAME = 'Tester'
# Package(s) holding the project's spiders, and the package used for newly
# generated spiders.
SPIDER_MODULES = ['Tester.spiders']
NEWSPIDER_MODULE = 'Tester.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Tester (+http://www.yourdomain.com)'
# Obey robots.txt rules (this is why robots.txt is requested before the first page)
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Tester.middlewares.TesterSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Tester.middlewares.TesterDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Maps each pipeline's import path to an integer order value. This entry
# enables the project's TesterPipeline.
ITEM_PIPELINES = {
    'Tester.pipelines.TesterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
When I run the spider, the following log is created:
(scrape) C:\Users\da74\Desktop\tester>scrapy crawl books.toscrape.com
2018-07-24 12:18:15 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: Tester)
2018-07-24 12:18:15 [scrapy.utils.log] INFO: Versions: lxml 4.2.2.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o 27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17134-SP0
2018-07-24 12:18:15 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'Tester', 'NEWSPIDER_MODULE': 'Tester.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['Tester.spiders']}
2018-07-24 12:18:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2018-07-24 12:18:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-07-24 12:18:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-07-24 12:18:16 [scrapy.middleware] INFO: Enabled item pipelines:
['Tester.pipelines.TesterPipeline']
2018-07-24 12:18:16 [scrapy.core.engine] INFO: Spider opened
2018-07-24 12:18:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-07-24 12:18:16 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-07-24 12:18:16 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://books.toscrape.com/robots.txt> (referer: None)
2018-07-24 12:18:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-1.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-2.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-7.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-4.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-3.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-9.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-5.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-8.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-6.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-10.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-12.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-11.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-14.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-15.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-16.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-17.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-13.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-18.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-19.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-21.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-20.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-22.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-23.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-25.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-24.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-26.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-27.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-32.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-29.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-30.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-33.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-28.html> (referer: None)
2018-07-24 12:18:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-31.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-34.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-35.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-36.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-39.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-40.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-38.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-41.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-37.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-42.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-43.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-44.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-47.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-45.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-46.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-48.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-49.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://books.toscrape.com/catalogue/page-50.html> (referer: None)
2018-07-24 12:18:18 [scrapy.core.engine] INFO: Closing spider (finished)
2018-07-24 12:18:18 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 12168,
'downloader/request_count': 51,
'downloader/request_method_count/GET': 51,
'downloader/response_bytes': 299913,
'downloader/response_count': 51,
'downloader/response_status_count/200': 50,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 7, 24, 4, 18, 18, 598891),
'log_count/DEBUG': 52,
'log_count/INFO': 7,
'response_received_count': 51,
'scheduler/dequeued': 50,
'scheduler/dequeued/memory': 50,
'scheduler/enqueued': 50,
'scheduler/enqueued/memory': 50,
'start_time': datetime.datetime(2018, 7, 24, 4, 18, 16, 208142)}
2018-07-24 12:18:18 [scrapy.core.engine] INFO: Spider closed (finished)
I don't understand why it isn't gathering items. It first says that 0 pages were crawled and 0 items scraped, and then shows 200 success responses for the pages.
If anyone has an idea of what to try to make it scrape the items, that would be very helpful.
Thank you
Related
I'm trying to scrape an Amazon product page, but Scrapy is giving me inconsistent results (sometimes it returns what I want and sometimes it returns None). I have no idea why the same code gives different results. I created a loop that yields the same request 10 times, and it gave me different results. Can anyone help me?
import scrapy
from scrapy import Request
class AmzsingleSpider(scrapy.Spider):
    """Fetch the same Amazon product page ten times and yield its title.

    Used to reproduce inconsistent extraction results across identical
    requests.
    """

    name = 'amzsingle'

    def start_requests(self):
        """Yield ten identical requests for the product page.

        ``dont_filter=True`` is required, otherwise the repeated URL would be
        dropped as a duplicate request.
        """
        for i in range(10):
            yield Request(url="https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929", callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield a dict with the product title, or ``None`` when not found."""
        yield {
            # `.get()` returns None when the XPath matches nothing.
            'title': response.xpath('//span[@id="productTitle"]/text()').get()
        }
This is the log that I get in the terminal. This attempt gave 9 None results and 1 title found (another time it returned 7 None and 3 found):
2021-11-27 22:08:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/robots.txt> (referer: None)
2021-11-27 22:08:30 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:30 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:36 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:38 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:40 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': '\nĀ”Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n'}
2021-11-27 22:08:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 22:08:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{'title': None}
2021-11-27 22:08:45 [scrapy.core.engine] INFO: Closing spider (finished)
2021-11-27 22:08:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4664,
'downloader/request_count': 11,
'downloader/request_method_count/GET': 11,
'downloader/response_bytes': 1508328,
'downloader/response_count': 11,
'downloader/response_status_count/200': 11,
'elapsed_time_seconds': 20.82323,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 11, 27, 15, 8, 45, 324091),
'httpcompression/response_bytes': 7323320,
'httpcompression/response_count': 11,
'item_scraped_count': 10,
'log_count/DEBUG': 22,
'log_count/INFO': 11,
'memusage/max': 53161984,
'memusage/startup': 53161984,
'proxies/good': 1,
'proxies/mean_backoff': 0.0,
'proxies/reanimated': 0,
'proxies/unchecked': 0,
'response_received_count': 11,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 10,
'scheduler/dequeued/memory': 10,
'scheduler/enqueued': 10,
'scheduler/enqueued/memory': 10,
'start_time': datetime.datetime(2021, 11, 27, 15, 8, 24, 500861)}
2021-11-27 22:08:45 [scrapy.core.engine] INFO: Spider closed (finished)
You can use a CSS selector.
import scrapy
from scrapy import Request
class AmzsingleSpider(scrapy.Spider):
    """Fetch the same Amazon product page ten times and yield its title.

    Same experiment as the XPath version, but the title is located with a
    CSS selector.
    """

    name = 'amzsingle-parse'

    def start_requests(self):
        """Yield ten identical requests for the product page.

        ``dont_filter=True`` is required, otherwise the repeated URL would be
        dropped as a duplicate request.
        """
        for _ in range(10):
            yield Request(url="https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929", callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield a dict holding the first text node under ``#productTitle``."""
        yield {
            'title': response.css('#productTitle ::text').get()
        }
Output
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
2021-11-27 15:56:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929> (referer: None)
2021-11-27 15:56:42 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/%C2%A1Avancemos-Student-Level-2013-Spanish/dp/0547871929>
{"title": "\n\u00a1Avancemos!: Student Edition Level 3 2013 (Spanish Edition)\n"}
Please, I need help. I am learning web scraping and have been struggling to get a spider to scrape a website.
I get 0 items scraped every time. I have set USER_AGENT and also set ROBOTSTXT_OBEY = False in settings.py, and yet it doesn't work.
I notice that when I use the Scrapy shell I get all the details. I have checked my code again and again but still can't find the error. Could someone please check it and tell me where I went wrong?
spider code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from batt_data.items import BattDataItem
import urllib.parse
class BatterySpider(CrawlSpider):
    """Crawl made-in-china.com search results for 24v batteries.

    Starts at the first result page, follows the "next page" links and
    extracts one ``BattDataItem`` per followed page in :meth:`parse_item`.
    """

    name = 'battery'
    # NOTE(review): 'web' does not match the crawled host; the log shows the
    # next-page request to 'www.made-in-china.com' being filtered as offsite.
    allowed_domains = ['web']
    start_urls = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']
    base_url = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']

    # Follow links found inside any element whose class contains "nextpage"
    # and parse each followed page with parse_item.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "nextpage")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build one item whose fields are lists of all matches on the page."""
        item = BattDataItem()
        item['description'] = response.xpath('//img[@class="J-firstLazyload"]/@alt').extract()
        item['chemistry'] = response.xpath('//li[@class="J-faketitle ellipsis"][1]/span/text()').extract()
        item['applications'] = response.xpath('//li[@class="J-faketitle ellipsis"][2]/span/text()').extract()
        item['shape'] = response.xpath('//li[@class="J-faketitle ellipsis"][4]/span/text()').extract()
        item['discharge_rate'] = response.xpath('//li[@class="J-faketitle ellipsis"][5]/span/text()').extract()
        yield item
log file:
C:\Users\Ikeen\batt_data>scrapy crawl battery
2020-08-29 21:17:27 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: batt_data)
2020-08-29 21:17:27 [scrapy.utils.log] INFO: Versions: lxml 4.1.0.0, libxml2 2.9.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.3 |Anaconda, Inc.| (default, Oct 15 2017, 03:27:45) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.2.0 (OpenSSL 1.0.2l 25 May 2017), cryptography 2.0.3, Platform Windows-10-10.0.18362-SP0
2020-08-29 21:17:27 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-08-29 21:17:27 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'batt_data',
'NEWSPIDER_MODULE': 'batt_data.spiders',
'SPIDER_MODULES': ['batt_data.spiders'],
'USER_AGENT': 'Mozilla/5.0'}
2020-08-29 21:17:27 [scrapy.extensions.telnet] INFO: Telnet Password: 549b17173b135b6b
2020-08-29 21:17:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-08-29 21:17:28 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-08-29 21:17:28 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-08-29 21:17:28 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-08-29 21:17:28 [scrapy.core.engine] INFO: Spider opened
2020-08-29 21:17:28 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-08-29 21:17:28 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-08-29 21:17:30 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html> (referer: None)
2020-08-29 21:17:30 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.made-in-china.com': <GET https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/2.html;jsessionid=2B77F23449911847145999CD6E9B6429>
2020-08-29 21:17:30 [scrapy.core.engine] INFO: Closing spider (finished)
2020-08-29 21:17:30 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 234,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 54381,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 2.42789,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 8, 29, 20, 17, 30, 804912),
'log_count/DEBUG': 2,
'log_count/INFO': 10,
'offsite/domains': 1,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 8, 29, 20, 17, 28, 377022)}
2020-08-29 21:17:30 [scrapy.core.engine] INFO: Spider closed (finished)
2020-08-29 21:17:30 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.made-in-china.com': <GET https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/2.html;jsessionid=2B77F23449911847145999CD6E9B6429>
Your request is being filtered as it doesn't belong to the allowed domains that you defined.
allowed_domains = ['web']
Use allowed_domains = ['made-in-china.com'] or remove it completely.
When I submit a form on the cboe.com website, it sends back a file that needs to be saved to disk and then redirects to the same form page.
How do I save the returned file using Scrapy? I have the code below, but it cannot get a handle on the file because the response is redirected.
import scrapy
class FileDownload(scrapy.Spider):
    """Load the CBOE quote-table form, submit it and inspect the response."""

    name = 'Test'

    def parse(self, response):
        """Submit the form found in *response* with the ticker filled in."""
        # NOTE(review): this form request carries no `meta`, so the
        # dont_redirect / handle_httpstatus_list options set in
        # start_requests do not apply to it.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'txtTicker': 'AAPL'},
            callback=self.after_download
        )

    def start_requests(self):
        """Request the form page with redirects disabled for that request."""
        yield scrapy.Request(
            'http://www.cboe.com/delayedquote/quote-table-download',
            meta={
                'dont_redirect': True,
                'handle_httpstatus_list': [302]
            }
        )

    def after_download(self, response):
        """Print the response returned by the form submission."""
        if response is None:
            print("--------Empty response for download----------")
        else:
            print(response)
log snippet of scrapy
2020-06-06 19:15:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.cboe.com/delayedquote/quote-table-download> (referer: None)
2020-06-06 19:15:14 [scrapy.extensions.logstats] INFO: Crawled 1 pages (at 1 pages/min), scraped 0 items (at 0 items/min)
2020-06-06 19:18:22 [scrapy.extensions.logstats] INFO: Crawled 1 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-06-06 19:18:27 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://www.cboe.com/delayedquote/quotedata.dat> from <POST http://www.cboe.com/delayedquote/quote-table-download>
2020-06-06 19:25:54 [scrapy.extensions.logstats] INFO: Crawled 1 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-06-06 19:25:55 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://www.cboe.com/delayedquote/quote-table-download> from <GET http://www.cboe.com/delayedquote/quotedata.dat>
2020-06-06 19:25:55 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET http://www.cboe.com/delayedquote/quote-table-download> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2020-06-06 19:25:55 [scrapy.core.engine] INFO: Closing spider (finished)
2020-06-06 19:25:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
http://www.cboe.com/delayedquote/quotedata.dat is data file that I need to save.
thanks
It looks like you have a wrong field name (txtTicker):
def parse(self, response):
    """Submit the download form with the ticker and submit-button fields.

    Yields a single ``FormRequest`` built from the form in *response*; its
    response is delivered to ``self.after_download``.
    """
    form_fields = {
        'ctl00$ContentTop$C005$txtTicker': 'AAPL',
        'ctl00$ContentTop$C005$cmdSubmit': 'Download',
    }
    download_request = scrapy.FormRequest.from_response(
        response,
        formdata=form_fields,
        callback=self.after_download,
    )
    yield download_request
Here is how I intend this code to work:
I have a keyword, say, "gadgets". I search for titles on IMDb's advanced search page. I want the code to go to each title page, then go to the keywords page of each title, and then download the title and all of its keywords.
The code structure looks good to me, but it is really not working.
Please suggest whether it needs to be rewritten or whether it can be corrected with some advice.
Here is my spider:
import scrapy
class KwordsSpider(scrapy.Spider):
    """Collect the keyword list of every IMDb title matching a keyword search.

    Flow: search results page -> each title page -> that title's keywords
    page, where the title and its keywords are emitted as one item.
    """

    name = 'ImdbSpider'
    allowed_domains = ['imdb.com']
    start_urls = [
        'https://www.imdb.com/search/title/?keywords=gadgets'
    ]

    def parse(self, response):
        """Follow every title on a results page, then the next results page."""
        title_blocks = response.xpath('//*[@class="lister-item-content"]')
        for block in title_blocks:
            href = block.xpath('.//h3/a/@href').extract_first()
            if href is None:
                # No link in this block: nothing to follow.
                continue
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse_title
            )

        # Select the href attribute itself (".../a/@href"). Selecting the
        # <a> element returns its full HTML, which produced a bogus URL that
        # the offsite middleware then filtered out.
        next_href = response.xpath(
            '//div[@class="article"]/div[@class="desc"]/a/@href'
        ).extract_first()
        if next_href is not None:
            yield scrapy.Request(
                response.urljoin(next_href), callback=self.parse
            )

    def parse_title(self, response):
        """Follow the link from a title page to its keywords page."""
        keywords_href = response.xpath('//nobr/a/@href').extract_first()
        if keywords_href is None:
            return
        yield scrapy.Request(
            response.urljoin(keywords_href), callback=self.parse_keys
        )

    # looking at the keywords page
    def parse_keys(self, response):
        """Emit one item holding the title and all of its keywords."""
        title = response.xpath('//h3/a/text()').extract_first()
        keys = response.xpath('//div[@class="sodatext"]/a/text()').extract()
        # `title` may be None when the heading is missing; avoid a TypeError.
        print('my print' + (title or ''))
        yield {
            'title': title,
            'Keywords': keys,
        }
The following are a few lines of the PowerShell output:
2020-05-02 08:33:40 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-05-02 08:33:40 [scrapy.core.engine] INFO: Spider opened
2020-05-02 08:33:40 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-05-02 08:33:40 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-05-02 08:33:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.imdb.com/search/title/?keywords=gadgets> (referer: None)
2020-05-02 08:33:43 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.imdb.com<a href="': <GET https://www.imdb.com<a href="/search/title/?keywords=gadgets&start=51%22%20class=%22lister-page-next%20next-page%22%3ENext%20%C2%BB%3C/a%3E>
2020-05-02 08:33:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.imdb.com/title/tt3896198/> (referer: https://www.imdb.com/search/title/?keywords=gadgets)
2020-05-02 08:34:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.imdb.com/title/tt0369171/> (referer: https://www.imdb.com/search/title/?keywords=gadgets)
2020-05-02 08:34:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.imdb.com/title/tt1149317/> (referer: https://www.imdb.com/search/title/?keywords=gadgets)
2020-05-02 08:34:11 [scrapy.core.engine] INFO: Closing spider (finished)
A few XPaths in your script were wrong. I've fixed them, so it should work now.
class KwordsSpider(scrapy.Spider):
    """Scrape the title and keyword list of each IMDb keyword-search result.

    The crawl goes from the search listing to each title page and from there
    to the title's keywords page, which yields the final item.
    """

    name = 'ImdbSpider'
    start_urls = [
        'https://www.imdb.com/search/title/?keywords=gadgets'
    ]

    def parse(self, response):
        """Request every listed title and then the next listing page."""
        for entry in response.xpath('//*[@class="lister-item-content"]'):
            title_href = entry.xpath('.//h3/a/@href').get()
            # urljoin(None) would silently resolve to the current page, so
            # skip entries without a link instead of re-requesting it.
            if title_href:
                yield scrapy.Request(
                    response.urljoin(title_href), callback=self.parse_title
                )

        next_page_url = response.xpath(
            '//div[@class="article"]/div[@class="desc"]/a/@href'
        ).get()
        if next_page_url:
            yield scrapy.Request(
                response.urljoin(next_page_url), callback=self.parse
            )

    def parse_title(self, response):
        """Request the keywords page linked from a title page."""
        keywords_href = response.xpath('//nobr/a/@href').get()
        if keywords_href:
            yield scrapy.Request(
                response.urljoin(keywords_href), callback=self.parse_keys
            )

    def parse_keys(self, response):
        """Yield a dict with the page's title and its keyword strings."""
        title = response.xpath('//h3/a/text()').get()
        keys = response.xpath('//div[@class="sodatext"]/a/text()').getall()
        yield {
            'title': title,
            'Keywords': keys,
        }
I am trying to download images from different urls via scrapy. I'm new to python and scrapy so maybe I'm missing something obvious. This is my first post on stack overflow. Help would be really appreciated!
Here are my different files :
items.py
from scrapy.item import Item, Field
class ImagesTestItem(Item):
    """Item describing the images to download and where they were stored."""

    # URLs that the images pipeline should download.
    image_urls = Field()
    image_names = Field()
    # Download results; presumably filled in by the images pipeline.
    images = Field()
    # Storage paths assigned in the pipeline's item_completed(). The field
    # must be declared here, because assigning an undeclared field on a
    # Scrapy Item raises KeyError.
    image_paths = Field()
settings.py:
from scrapy import log

# Debug markers: emitted once, when this settings module is imported.
log.msg("This is a warning", level=log.WARNING)
log.msg("This is a error", level=log.ERROR)

BOT_NAME = 'images_test'

SPIDER_MODULES = ['images_test.spiders']
NEWSPIDER_MODULE = 'images_test.spiders'

# Pipeline class path -> order value.
ITEM_PIPELINES = {'images_test.pipelines.images_test': 1}
# Directory under which downloaded images are stored.
IMAGES_STORE = '/Users/Coralie/Documents/scrapy/images_test/images'

# Seconds to wait between consecutive requests.
DOWNLOAD_DELAY = 5

# STATS_CLASS expects the import path of a stats collector class, not a
# boolean, so the former `STATS_CLASS = True` has been dropped and Scrapy's
# default collector is used instead.
spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item,Field
from scrapy.utils.response import get_base_url
import logging
from scrapy.log import ScrapyFileLogObserver
# Runs at import time: set up a file-backed log observer for this spider module.
# 'w' mode truncates testlog.log each time the module is loaded; the handle is
# not closed anywhere in the visible code.
logfile = open('testlog.log', 'w')
# Observer created at DEBUG level -- presumably mirrors Scrapy's log output
# into the file; confirm against the ScrapyFileLogObserver API.
log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
log_observer.start()
class images_test(CrawlSpider):
    """Collect the image URLs of the photo gallery into a single item.

    The item's ``image_urls`` field is what the images pipeline consumes to
    download the files.
    """

    name = "images_test"
    allowed_domains = ['veranstaltungszentrum.bbaw.de']
    # Start from the HTML gallery page rather than from the image files:
    # an HTML selector cannot extract <img> tags from binary JPEG responses.
    # NOTE(review): URL assumed to serve the gallery's HTML -- confirm.
    start_urls = ['http://veranstaltungszentrum.bbaw.de/en/photo_gallery']

    def parse_start_url(self, response):
        """Route start-URL responses to ``parse_item``.

        Without rules, CrawlSpider never invokes ``parse_item`` on its own;
        this hook is what it calls for responses to ``start_urls``.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Yield one item listing every image URL found on the page."""
        # Local imports keep this block self-contained.
        try:
            from urlparse import urljoin  # Python 2
        except ImportError:
            from urllib.parse import urljoin  # Python 3
        # NOTE(review): assumes the project package is `images_test` and
        # that items.py defines ImagesTestItem -- confirm the import path.
        from images_test.items import ImagesTestItem

        hxs = HtmlXPathSelector(response)
        page_url = get_base_url(response)

        item = ImagesTestItem()
        # Resolve each src against the page URL so relative paths become
        # absolute, downloadable URLs.
        item['image_urls'] = [
            urljoin(page_url, src)
            for src in hxs.select('//img/@src').extract()
        ]
        yield item
pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
from PIL import Image
from scrapy import log
# Debug markers: emitted once, when this pipelines module is imported.
log.msg("This is a warning", level=log.WARNING)
log.msg("This is a error", level=log.ERROR)
# The bare `scrapy.log.ERROR` expression that followed was removed: it had no
# effect, and the name `scrapy` is not bound by this module's imports, so
# evaluating it would raise NameError while the module loads.
class images_test(ImagesPipeline):
    """Images pipeline that drops items for which no image was stored."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``."""
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Record the stored image paths on the item.

        ``results`` is iterated as ``(ok, detail)`` pairs; only successful
        entries contribute their ``detail['path']``.

        Raises:
            DropItem: if no image was stored for this item.
        """
        stored_paths = []
        for succeeded, detail in results:
            if succeeded:
                stored_paths.append(detail['path'])

        if stored_paths:
            # The item class must declare an `image_paths` field for this
            # assignment to succeed.
            item['image_paths'] = stored_paths
            return item
        raise DropItem("Item contains no images")
the log is saying the following:
/Library/Python/2.7/site-packages/Scrapy-0.20.2-py2.7.egg/scrapy/settings/deprecated.py:26: ScrapyDeprecationWarning: You are using the following settings which are deprecated or obsolete (ask scrapy-users@googlegroups.com for alternatives):
STATS_ENABLED: no longer supported (change STATS_CLASS instead)
warnings.warn(msg, ScrapyDeprecationWarning)
2014-01-03 11:36:48+0100 [scrapy] INFO: Scrapy 0.20.2 started (bot: images_test)
2014-01-03 11:36:48+0100 [scrapy] DEBUG: Optional features available: ssl, http11
2014-01-03 11:36:48+0100 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'images_test.spiders', 'SPIDER_MODULES': ['images_test.spiders'], 'DOWNLOAD_DELAY': 5, 'BOT_NAME': 'images_test'}
2014-01-03 11:36:48+0100 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-01-03 11:36:49+0100 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-01-03 11:36:49+0100 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-01-03 11:36:49+0100 [scrapy] WARNING: This is a warning
2014-01-03 11:36:49+0100 [scrapy] ERROR: This is a error
2014-01-03 11:36:49+0100 [scrapy] DEBUG: Enabled item pipelines: images_test
2014-01-03 11:36:49+0100 [images_test] INFO: Spider opened
2014-01-03 11:36:49+0100 [images_test] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-01-03 11:36:49+0100 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2014-01-03 11:36:49+0100 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2014-01-03 11:36:49+0100 [images_test] DEBUG: Crawled (404) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib00_g.jpg> (referer: None)
2014-01-03 11:36:55+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib01_g.jpg> (referer: None)
2014-01-03 11:36:59+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib02_g.jpg> (referer: None)
2014-01-03 11:37:05+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib03_g.jpg> (referer: None)
2014-01-03 11:37:10+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib04_g.jpg> (referer: None)
2014-01-03 11:37:16+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib05_g.jpg> (referer: None)
2014-01-03 11:37:22+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib06_g.jpg> (referer: None)
2014-01-03 11:37:29+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib07_g.jpg> (referer: None)
2014-01-03 11:37:36+0100 [images_test] DEBUG: Crawled (200) <GET http://veranstaltungszentrum.bbaw.de/en/photo_gallery/leib08_g.jpg> (referer: None)
2014-01-03 11:37:36+0100 [images_test] INFO: Closing spider (finished)
2014-01-03 11:37:36+0100 [images_test] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2376,
'downloader/request_count': 9,
'downloader/request_method_count/GET': 9,
'downloader/response_bytes': 343660,
'downloader/response_count': 9,
'downloader/response_status_count/200': 8,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 1, 3, 10, 37, 36, 166139),
'log_count/DEBUG': 15,
'log_count/ERROR': 1,
'log_count/INFO': 3,
'log_count/WARNING': 1,
'response_received_count': 9,
'scheduler/dequeued': 9,
'scheduler/dequeued/memory': 9,
'scheduler/enqueued': 9,
'scheduler/enqueued/memory': 9,
'start_time': datetime.datetime(2014, 1, 3, 10, 36, 49, 37947)}
2014-01-03 11:37:36+0100 [images_test] INFO: Spider closed (finished)
How come images are not getting saved? Even my print item['image_urls'] command is not being executed.
Thank you
Consider changing your spider code to the following:
start_urls = ['http://veranstaltungszentrum.bbaw.de/en/photo_gallery']

def parse(self, response):
    """Return one item listing the absolute URL of every image on the page."""
    # Local import so the snippet is self-contained on Python 2 and 3.
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3

    sel = HtmlXPathSelector(response)
    item = ImagesTestItem()
    url = 'http://veranstaltungszentrum.bbaw.de'
    # Assign first, then return the item: `return item[...] = ...` is not
    # valid Python.
    item['image_urls'] = [
        urljoin(url, x) for x in sel.select('//img/@src').extract()
    ]
    return item
HtmlXPathSelector can only parse HTML documents; it seems that you fed it images from your start_urls.
You can try it without pipelines:
def parse(self, response):
    """Download the first image on the page straight to disk, no pipeline.

    The file is written to ``foldername/<last URL path segment>``. Nothing is
    done when the page contains no ``<img src>``.
    """
    import urllib.request

    # extract your images url
    image_src = response.xpath("//img/@src").get()
    if image_src is None:
        # No image on this page: nothing to download.
        return

    # Resolve a relative src against the page URL before fetching it.
    imageurl = response.urljoin(image_src)
    imagename = imageurl.split("/")[-1]
    req = urllib.request.Request(imageurl, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'})
    # NOTE(review): urlopen blocks inside a Scrapy callback; acceptable for a
    # quick test, but it stalls other requests while the download runs.
    # Context managers close both the HTTP response and the file on error.
    with urllib.request.urlopen(req) as resource, \
            open("foldername/" + imagename, "wb") as output:
        output.write(resource.read())