How can I start a brand new request in Scrapy crawler? - http

I am scraping from a website that will give every request session a sid, after getting the sid, I perform further search query with this sid and scrape the results.
I want to change the sid every time I've finished scraping all results of a single query, I've tried clearing the cookies but it doesn't work.
However, if I restart my crawler, it will get a different sid each time; I just don't know how to get a new sid without restarting the crawler.
I am wondering if there is something else that lets the server know that two requests come from the same connection.
Thanks!
Here is my current code:
class MySpider(scrapy.Spider):
    """Submit one search query per landing-page response.

    ``parse`` handles the landing page, whose URL carries the SID. For every
    landing page it submits one search form and then requests the landing
    page again, so the next query is driven by a new landing-page response.
    """

    name = 'my_spider'
    allowed_domains = ['xxx.com']
    start_urls = ['http://xxx/']
    sid_pattern = r'SID=(\w+)&'
    SID = None
    query_list = ['aaa', 'bbb', 'ccc']
    i = 0  # index of the next entry of query_list to submit

    def parse(self, response):
        """Read the SID from the response URL and submit the next query.

        Yields the search ``FormRequest`` followed by a new request for the
        landing page. Yields nothing once every query has been submitted.

        Raises:
            CloseSpider: if the response URL does not contain a SID.
        """
        # Imported locally because this snippet shows no import section.
        from scrapy.exceptions import CloseSpider

        if self.i >= len(self.query_list):
            return

        match = re.search(self.sid_pattern, response.url)
        if match is None:
            # exit() inside a callback does not shut the crawl down cleanly;
            # CloseSpider asks Scrapy to stop the spider.
            raise CloseSpider('sid_not_found')
        self.SID = match.group(1)

        search_url = 'http://xxxx/AdvancedSearch.do'
        # The counter is an attribute, so it must be read through self.
        # The placeholder form below does not use `query` yet.
        query = self.query_list[self.i]
        self.i += 1
        query_form = {
            'aaa': 'bbb'
        }
        yield FormRequest(search_url, method='POST', formdata=query_form,
                          dont_filter=True, callback=self.parse_result)
        # Request the landing page again; dont_filter is required because
        # the same URL has already been crawled.
        yield Request(self.start_urls[0], cookies={}, callback=self.parse,
                      dont_filter=True)

    def parse_result(self, response):
        """Handle one search-result page (placeholder)."""
        # do something
        pass

Setting COOKIES_ENABLED = False can achieve this, but is there another way other than a global settings?

Related

Scrapy isnt scraping the next page

I am trying to scrape article news from skynewsarabia.com
# NOTE(review): the indentation of this snippet was reconstructed; confirm
# the nesting of the statements after the loops in ``parse``.
class SkyNewsSportsSpider(scrapy.Spider):
    """Page through the latest-articles JSON feed and request each article."""
    name = 'sky_news_sports'
    # Prefix joined with "<id>-<urlFriendlySuffix>" to build an article URL.
    sport = "https://www.skynewsarabia.com/sport/"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ["article_content", "tags"],
    }
    # NOTE: the feed URL below is on api.skynewsarabia.com, which does not
    # match this entry (see the answer that follows this snippet).
    allowed_domains = ['www.skynewsarabia.com']
    # Token substituted into the feed URL for the first request.
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]
    # Article URLs collected from the feed page currently being parsed.
    urls = []
    def parse(self, response):
        """Request every article listed in the feed page, then the next page."""
        articles = json.loads(response.text)
        # to get the link for each article we need to combine both the id and the urlFriendlySuffix in one link
        for article in range(0, len(articles["contentItems"])):
            article_id = articles["contentItems"][article]["id"]
            article_url = articles["contentItems"][article]["urlFriendlySuffix"]
            relative_link = article_id + "-" + article_url
            full_link = self.sport + relative_link
            self.urls.append(full_link)
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        # Reset so the next feed page starts with an empty list.
        self.urls = []
        print("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            print("I am inside!")
            print(next_page)
            yield response.follow(url=next_page, callback=self.parse)
    def parse_details(self, response):
        """Article-page callback; not implemented in this snippet."""
        pass
The basic idea is that you first scrape a URL whose response lists 20 article links. That response also carries a token for the next page, which you need to insert into the next URL so that you can scrape the following 20 links. However, the problem I am facing is that when I run the script, it takes the next token, gets all the links for that token, and then stops, so I am scraping only 20 links! When I print first_token, it gives me something different from 1569266773000, which is the default provided in the script.
You need to change allowed_domains = ['www.skynewsarabia.com'] to allowed_domains = ['skynewsarabia.com']. Alternatively remove the allowed_domains variable completely.
Since you have specified the hostname with the www prefix, Scrapy filters the requests to api.skynewsarabia.com as offsite, and those calls are simply dropped.
Additional tip: Try to use self.logger.info and self.logger.debug instead of the print commands in your code.

How can I grab a link from javascript:void(0) using Scrapy with Splash?

I want to get the url of next page of this site:
https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=4&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
is there a way?
I've tried some ways, but in vain.
import json

import scrapy
import splash
from scrapy_splash import SplashRequest
class QuotesSpider(scrapy.Spider):
    """Render the job-offer listing through Splash and scrape each offer."""

    name = "Spider"
    start_urls = [
        'https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955'
    ]
    splash.private_mode_enabled = False

    def start_requests(self):
        """Request every start URL through Splash, handled by ``parse``."""
        for url in self.start_urls:
            yield SplashRequest(url=url, formdata={'modelStr': json.dumps({'pageSize': 100})},
                                callback=self.parse, args={'wait': 6})

    def parse(self, response):
        """Follow every offer link found on the listing page."""
        for link in response.css('span.ng-scope>a::attr(href)').extract():
            yield SplashRequest(url='https://cadres.apec.fr' + link,
                                callback=self.parse_details,
                                args={'wait': 8, 'private_mode_enabled': False})

    def parse_details(self, response):
        """Yield one dict describing the offer shown on the detail page.

        The first four ``div.col-md-6>p`` texts are read positionally as
        salary, capture of post, experience and job status. A position that
        is missing on the page yields ``None`` instead of raising
        ``IndexError``.
        """
        # Query the shared selector once and pad it to four entries so that
        # the positional unpacking below cannot fail on sparse pages.
        facts = response.css('div.col-md-6>p::text').extract()
        salary, capturepost, experience, job_status = (facts + [None] * 4)[:4]

        company = response.css('[id="entreprise"]>p::text').extract()
        description = response.css('[id="descriptif-du-poste"]>p::text').extract()

        # Yield the dict directly rather than binding it to a local named
        # `list`, which shadowed the builtin.
        yield {
            "Name": response.css('p.margin-bottom-0 > strong::text').get(),
            'Salary': salary,
            'Post': response.css('h1.text-uppercase::text').get(),
            'Reference': response.css('p.margin-bottom-5::text').get(),
            'Experience': experience,
            'Job Status': job_status,
            'Profile': response.css('[id="profil-recherche"]>p::text').extract(),
            'Company': '\n'.join(company),
            'Capture of Post': capturepost,
            'Description': '\n'.join(description),
        }
How can I get the javascript:void(0) URL?
Try to find the total number of pages and format the page number in the URL accordingly.
URL = https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
Replace page=1 with a page variable and iterate from 1 up to the page count, i.e. the total number of items divided by 20 items per page (rounded up).

How to stop the Crawler

I am trying to write a crawler that goes to a website and searches for a list of keywords, with a max depth of 2. The scraper is supposed to stop as soon as any of the keywords appears on any page; the problem I am facing right now is that the crawler does not stop when it first sees one of the keywords.
This happens even after trying an early return, a break statement, CloseSpider, and even Python's exit commands.
My class of the Crawler:
# NOTE(review): the indentation of this snippet was reconstructed; confirm
# the nesting of the keyword loops in ``check_buzzwords``.
class WebsiteSpider(CrawlSpider):
    """Crawl the site and report the first page that contains a keyword."""
    name = "webcrawler"
    allowed_domains = ["www.roomtoread.org"]
    start_urls = ["https://"+"www.roomtoread.org"]
    # Follow every extracted link and pass each response to check_buzzwords.
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]
    # Counters kept on the class, so they are shared by all instances.
    crawl_count = 0
    words_found = 0
    def check_buzzwords(self, response):
        """Search the response body for each keyword; print the first hit."""
        self.__class__.crawl_count += 1
        # Local copy; not used again in this method.
        crawl_count = self.__class__.crawl_count
        wordlist = [
            "sfdc",
            "pardot",
            "Web-to-Lead",
            "salesforce"
        ]
        url = response.url
        # Not used again in this method.
        contenttype = response.headers.get("content-type", "").decode('utf-8').lower()
        data = response.body.decode('utf-8')
        for word in wordlist:
            # find_all_substrings is defined outside this snippet.
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                # ok is always False here, so the branch below always runs.
                ok = False
                if not ok:
                    # Only the first match across the whole crawl is printed.
                    if self.__class__.words_found==0:
                        self.__class__.words_found += 1
                        print(word + "," + url + ";")
                        # Placeholder from the question (not valid Python):
                        # the point where the crawl is supposed to stop.
                        STOP!
        return Item()
    def _requests_to_follow(self, response):
        """Follow links only for responses that have a non-None ``encoding``."""
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want it to stop execution when if not ok: is True.
When I want to stop a spider, I usually raise the exception scrapy.exceptions.CloseSpider(reason='cancelled') described in the Scrapy docs.
The example there shows how you can use it:
if 'Bandwidth exceeded' in response.body:
raise CloseSpider('bandwidth_exceeded')
In your case something like
if not ok:
raise CloseSpider('keyword_found')
Or is that what you meant with
CloseSpider Commands
and already tried it?

How to scrape multiple URLs with same parse using Scrapy?

Hi, I am having a problem with my spider script. I want to make the script as readable as possible and keep the code as short as possible. Is it possible to use the same parse method for different URLs?
I want to scrape only 10 items per page and save them in different item classes defined in items.py.
Here's my code
def start_requests(self):
    """Request the three post-ratings pages, all handled by ``parse``."""
    rating_urls = (
        'https://teslamotorsclub.com/tmc/post-ratings/6/posts',  # Url 1
        'https://teslamotorsclub.com/tmc/post-ratings/7/posts',  # Url 2
        'https://teslamotorsclub.com/tmc/post-ratings/1/posts',  # Url 3
    )
    for rating_url in rating_urls:
        yield scrapy.Request(rating_url, self.parse)
def parse(self, response): #My logic is something like this
    # Pseudo-code from the question: Url, Url1..Url3, count and postno_only
    # are not defined anywhere in the visible snippet.
    # Pick the item class that belongs to the URL being parsed.
    if Url == Url1:
        item = TmcnfSpiderItem()
    elif Url == Url2:
        item = TmcnfSpiderItem2()
    elif Url == Url3:
        item = TmcnfSpiderItem3()
    # Emit at most 10 items (count runs from 0 to 9).
    if count <= 9:
        count += 1
        info = response.css("[id^='fc-post-" + postno_only +"']")
        # NOTE(review): "#data-author" is not an XPath attribute test;
        # "@data-author" was presumably intended - confirm.
        author = info.xpath("#data-author").extract_first()
        item['author'] = author
        yield item
    else:
        #Move to next URL and perform same parse
Any idea?
I think you can try to pass all data from start_requests, like here:
def start_requests(self):
    """Request each ratings page, carrying its item class in ``meta``.

    The class travels with the request under the ``'itemclass'`` key so the
    callback can instantiate the right item type for the page it receives.
    """
    item_class_by_url = {
        'https://teslamotorsclub.com/tmc/post-ratings/6/posts': TmcnfSpiderItem,
        'https://teslamotorsclub.com/tmc/post-ratings/7/posts': TmcnfSpiderItem2,
        'https://teslamotorsclub.com/tmc/post-ratings/1/posts': TmcnfSpiderItem3,
    }
    for page_url, item_class in item_class_by_url.items():
        yield scrapy.Request(page_url, meta={'itemclass': item_class})
def parse(self, response):
    """Create an item of the class that was attached to this request."""
    # 'itemclass' holds a class object; calling it builds a new, empty item.
    item = response.meta['itemclass']()
So you pass the item class for each URL and, in the parse function, create a new instance of that class.

When to return an Item if I don't know when the spider will finish?

So my spider takes in a list of websites, and it crawls through each one via start_requests which yield request passing in item as meta.
Then, the spider explores all the internal links of a single website and collects all the external links into the item. The problem is that I don't know when the spider finishes crawling all the internal links, so I can't yield an item.
# NOTE(review): the indentation of this snippet was reconstructed; confirm
# where ``return requests`` sits relative to the ``with`` block.
class WebsiteSpider(scrapy.Spider):
    """Crawl each seed site's internal links and collect its external links."""
    name = "web"
    def start_requests(self):
        """Build one seed request per data row of ``websites.csv``.

        Raises CloseSpider when the file cannot be read.
        """
        filename = "websites.csv"
        requests = []
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                # Consume the first row; the value is not used afterwards.
                header = next(reader)
                for row in reader:
                    # The seed URL is read from the second column.
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    request = Request(seed_url, callback=self.parse_seed)
                    # The item travels with the request so every page of the
                    # site appends to the same on_list.
                    request.meta['item'] = item
                    requests.append(request)
            return requests
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")
    def parse_seed(self, response):
        """Record external links on the item and follow internal links."""
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc
        # Links pointing away from the seed's host.
        external_le = LinkExtractor(deny_domains=netloc)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            item['on_list'].append(external_link)
        # Links staying on the seed's host are crawled with the same item.
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        for internal_link in internal_links:
            # NOTE(review): extract_links presumably returns Link objects,
            # so Request may need internal_link.url - confirm.
            request = Request(internal_link, callback=self.parse_seed)
            request.meta['item'] = item
            yield request
The start_requests method needs to yield Request objects. You don't need to return a list of requests; just yield each Request as soon as it is ready. This works because Scrapy requests are asynchronous.
The same applies to items: you just need to yield an item whenever you think it is ready. For your case, I would recommend checking whether there are no more internal_links before yielding the item; alternatively, you can yield as many items as you want and then check which one was the last (or the one with the most data):
class WebsiteSpider(scrapy.Spider):
    """Crawl each seed site's internal links and collect its external links."""

    name = "web"

    def start_requests(self):
        """Yield one seed request per data row of ``websites.csv``.

        The first row is skipped as a header and the seed URL is read from
        the second column. Rows with fewer than two columns are ignored.

        Raises:
            CloseSpider: if the CSV file cannot be read.
        """
        filename = "websites.csv"
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                # Skip the header row. The default keeps an empty file from
                # raising StopIteration inside this generator.
                next(reader, None)
                for row in reader:
                    if len(row) < 2:
                        continue  # blank or malformed line: no URL column
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    # The item travels with the request so every page of the
                    # site appends to the same on_list.
                    yield Request(seed_url, callback=self.parse_seed,
                                  meta={'item': item})
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")

    def parse_seed(self, response):
        """Record external links on the item and follow internal links.

        Yields a request for every internal link found on the page. When the
        page has no internal links, yields the item instead.
        """
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc

        # Links pointing away from the seed's host.
        external_le = LinkExtractor(deny_domains=netloc)
        for external_link in external_le.extract_links(response):
            item['on_list'].append(external_link)

        # Links staying on the seed's host are crawled with the same item.
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        if not internal_links:
            yield item
            return
        for internal_link in internal_links:
            # extract_links returns Link objects; Request needs the URL string.
            yield Request(internal_link.url, callback=self.parse_seed,
                          meta={'item': item})
Another thing you could do is create an extension with a spider_closed handler, so you can do whatever you need once you know the spider has ended.

Resources