How to scrape multiple URLs with the same parse using Scrapy?

Hi, I'm having a problem with my spider script. I want to keep the script as readable as possible and avoid repeating code. Is it possible to use the same parse callback for different URLs?
I want to scrape only 10 items per page and save each URL's results to a different item class in items.py.
Here's my code:
def start_requests(self):  # I have 3 URLs here
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/6/posts', self.parse)  # Url 1
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/7/posts', self.parse)  # Url 2
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/1/posts', self.parse)  # Url 3

def parse(self, response):  # My logic is something like this
    if Url == Url1:
        item = TmcnfSpiderItem()
    elif Url == Url2:
        item = TmcnfSpiderItem2()
    elif Url == Url3:
        item = TmcnfSpiderItem3()
    if count <= 9:
        count += 1
        info = response.css("[id^='fc-post-" + postno_only + "']")
        author = info.xpath("@data-author").extract_first()
        item['author'] = author
        yield item
    else:
        pass  # Move to next URL and perform the same parse
Any idea?

I think you can try to pass all data from start_requests, like here:
def start_requests(self):
    urls = (
        ('https://teslamotorsclub.com/tmc/post-ratings/6/posts', TmcnfSpiderItem),
        ('https://teslamotorsclub.com/tmc/post-ratings/7/posts', TmcnfSpiderItem2),
        ('https://teslamotorsclub.com/tmc/post-ratings/1/posts', TmcnfSpiderItem3),
    )
    for url, itemclass in urls:
        yield scrapy.Request(url, meta={'itemclass': itemclass})

def parse(self, response):
    item = response.meta['itemclass']()
So you pass the item class for each URL, and in the parse function you create a new instance of that class.
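Putting the pieces together, here is a minimal sketch of how the full spider could look with this pattern, limiting output to the first 10 posts per page. The fc-post selector and data-author attribute are taken from the question and not verified, and the items import path is an assumption:

import scrapy
from ..items import TmcnfSpiderItem, TmcnfSpiderItem2, TmcnfSpiderItem3  # assumed module layout


class TmcnfSpider(scrapy.Spider):
    name = 'tmcnf'

    def start_requests(self):
        urls = (
            ('https://teslamotorsclub.com/tmc/post-ratings/6/posts', TmcnfSpiderItem),
            ('https://teslamotorsclub.com/tmc/post-ratings/7/posts', TmcnfSpiderItem2),
            ('https://teslamotorsclub.com/tmc/post-ratings/1/posts', TmcnfSpiderItem3),
        )
        for url, itemclass in urls:
            # pass the item class along with the request so parse() knows which one to build
            yield scrapy.Request(url, callback=self.parse, meta={'itemclass': itemclass})

    def parse(self, response):
        itemclass = response.meta['itemclass']
        # only keep the first 10 posts on each page
        for post in response.css("[id^='fc-post-']")[:10]:
            item = itemclass()
            item['author'] = post.xpath('@data-author').get()
            yield item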

Related

Scrapy isn't scraping the next page

I am trying to scrape news articles from skynewsarabia.com:
import json

import scrapy


class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    sport = "https://www.skynewsarabia.com/sport/"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ["article_content", "tags"],
    }
    allowed_domains = ['www.skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]
    urls = []

    def parse(self, response):
        articles = json.loads(response.text)
        # to get the link for each article we need to combine both the id and the urlFriendlySuffix in one link
        for article in range(0, len(articles["contentItems"])):
            article_id = articles["contentItems"][article]["id"]
            article_url = articles["contentItems"][article]["urlFriendlySuffix"]
            relative_link = article_id + "-" + article_url
            full_link = self.sport + relative_link
            self.urls.append(full_link)
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        self.urls = []
        print("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            print("I am inside!")
            print(next_page)
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        pass
The basic idea here is that you first scrape a link which returns 20 links. Besides that, the first response also contains a token for the next page, which you need to add to the next URL so you can scrape the next 20 links. However, the problem I am facing is that when I run the script it takes the next token, gets all the links for that token, and then stops! So I am scraping only 20 links. When I print first_token it gives me something different from 1569266773000, which is the default provided in the script.
You need to change allowed_domains = ['www.skynewsarabia.com'] to allowed_domains = ['skynewsarabia.com']. Alternatively remove the allowed_domains variable completely.
Since you have specified the www hostname, Scrapy filters the requests to api.skynewsarabia.com as offsite and they are simply dropped.
Additional tip: Try to use self.logger.info and self.logger.debug instead of the print commands in your code.
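For clarity, a minimal sketch of the change (everything else in the spider stays as in the question):

class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    # without the www prefix, both www.skynewsarabia.com and api.skynewsarabia.com
    # count as on-site for Scrapy's offsite filtering
    allowed_domains = ['skynewsarabia.com']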

How to stop the Crawler

I am trying to write a crawler that goes to a website and searches for a list of keywords, with a max depth of 2. The scraper is supposed to stop once any of the keywords appears on any page; the problem I am facing right now is that the crawler does not stop when it first sees one of the keywords.
I have already tried an early return, a break statement, CloseSpider, and even Python exit commands.
My crawler class:
from scrapy import Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# find_all_substrings is a helper defined elsewhere in the project


class WebsiteSpider(CrawlSpider):
    name = "webcrawler"
    allowed_domains = ["www.roomtoread.org"]
    start_urls = ["https://" + "www.roomtoread.org"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    crawl_count = 0
    words_found = 0

    def check_buzzwords(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        wordlist = [
            "sfdc",
            "pardot",
            "Web-to-Lead",
            "salesforce"
        ]
        url = response.url
        contenttype = response.headers.get("content-type", "").decode('utf-8').lower()
        data = response.body.decode('utf-8')
        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                ok = False
                if not ok:
                    if self.__class__.words_found == 0:
                        self.__class__.words_found += 1
                        print(word + "," + url + ";")
                        STOP!
        return Item()

    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want it to stop execution when if not ok: is True.
When I want to stop a spider, I usually use the exception scrapy.exceptions.CloseSpider(reason='cancelled') from the Scrapy docs.
The example there shows how you can use it:
if 'Bandwidth exceeded' in response.body:
    raise CloseSpider('bandwidth_exceeded')
In your case something like
if not ok:
    raise CloseSpider('keyword_found')
Or is that what you meant with
CloseSpider Commands
and already tried it?
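For completeness, a minimal sketch of how that could fit into the question's check_buzzwords, simplifying the substring search to a plain membership check (Item and the word list come from the question's code):

from scrapy.exceptions import CloseSpider

def check_buzzwords(self, response):
    data = response.body.decode('utf-8')
    wordlist = ["sfdc", "pardot", "Web-to-Lead", "salesforce"]
    for word in wordlist:
        if word in data:
            self.logger.info("%s,%s;", word, response.url)
            # raising CloseSpider stops scheduling new requests; requests already
            # in flight may still finish before the spider shuts down
            raise CloseSpider('keyword_found')
    return Item()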

How can I start a brand new request in Scrapy crawler?

I am scraping a website that gives every request session a sid. After getting the sid, I perform a further search query with it and scrape the results.
I want to change the sid every time I finish scraping all the results of a single query. I've tried clearing the cookies, but it doesn't work.
However, if I restart my crawler, it gets a different sid each time; I just don't know how to get a new sid without restarting the crawler.
I am wondering if there is something else that lets the server know two requests are from the same connection.
Thanks!
Here is my current code:
import re

import scrapy
from scrapy import FormRequest, Request


class MySpider(scrapy.Spider):
    name = 'my_spider'
    allowed_domains = ['xxx.com']
    start_urls = ['http://xxx/']

    sid_pattern = r'SID=(\w+)&'
    SID = None
    query_list = ['aaa', 'bbb', 'ccc']
    i = 0

    def parse(self, response):
        if self.i >= len(self.query_list):
            return

        pattern = re.compile(self.sid_pattern)
        result = re.search(pattern, response.url)
        if result is not None:
            self.SID = result.group(1)
        else:
            exit(-1)

        search_url = 'http://xxxx/AdvancedSearch.do'
        query = self.query_list[self.i]
        self.i += 1
        query_form = {
            'aaa': 'bbb'
        }
        yield FormRequest(search_url, method='POST', formdata=query_form, dont_filter=True,
                          callback=self.parse_result)
        yield Request(self.start_urls[0], cookies={}, callback=self.parse, dont_filter=True)

    def parse_result(self, response):
        pass  # do something
Setting COOKIES_ENABLED = False can achieve this, but is there another way other than a global setting?
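One alternative to disabling cookies globally is Scrapy's cookiejar request meta key, which keeps a separate cookie session per jar; whether that actually yields a fresh sid still depends on how the server assigns sessions. A rough sketch along the lines of the question's spider (the URLs and form fields are the question's placeholders):

import scrapy


class MySpider(scrapy.Spider):
    name = 'my_spider'
    start_urls = ['http://xxx/']
    query_list = ['aaa', 'bbb', 'ccc']

    def start_requests(self):
        # one cookie jar per query, so each query runs in its own cookie session
        for jar_id, query in enumerate(self.query_list):
            yield scrapy.Request(
                self.start_urls[0],
                callback=self.parse,
                dont_filter=True,
                meta={'cookiejar': jar_id, 'query': query},
            )

    def parse(self, response):
        # keep passing the same cookiejar id so follow-up requests stay in this session
        yield scrapy.FormRequest(
            'http://xxxx/AdvancedSearch.do',
            formdata={'aaa': response.meta['query']},
            callback=self.parse_result,
            meta={'cookiejar': response.meta['cookiejar']},
        )

    def parse_result(self, response):
        pass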

When to return an Item if I don't know when the spider will finish?

So my spider takes in a list of websites and crawls each one via start_requests, which yields a request with the item passed in meta.
Then the spider explores all the internal links of a single website and collects all the external links into the item. The problem is that I don't know when the spider finishes crawling all the internal links, so I can't yield the item.
import csv
from urllib.parse import urlparse

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor

# Links is the project's Item subclass defined in items.py


class WebsiteSpider(scrapy.Spider):
    name = "web"

    def start_requests(self):
        filename = "websites.csv"
        requests = []
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                header = next(reader)
                for row in reader:
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    request = Request(seed_url, callback=self.parse_seed)
                    request.meta['item'] = item
                    requests.append(request)
            return requests
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")

    def parse_seed(self, response):
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc
        external_le = LinkExtractor(deny_domains=netloc)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            item['on_list'].append(external_link)
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        for internal_link in internal_links:
            request = Request(internal_link, callback=self.parse_seed)
            request.meta['item'] = item
            yield request
The start_requests method needs to yield Request objects. You don't need to return a list of requests; just yield each Request when it is ready. This works because Scrapy requests are asynchronous.
The same goes for items: yield an item whenever you think it is ready. For your case I would recommend checking whether there are no more internal_links left and only then yielding the item. Alternatively, you can yield as many items as you want and afterwards check which one was the last (or the one with the most data):
class WebsiteSpider(scrapy.Spider):
    name = "web"

    def start_requests(self):
        filename = "websites.csv"
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                header = next(reader)
                for row in reader:
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    yield Request(seed_url, callback=self.parse_seed, meta={'item': item})
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")

    def parse_seed(self, response):
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc
        external_le = LinkExtractor(deny_domains=netloc)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            item['on_list'].append(external_link)
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        if internal_links:
            for internal_link in internal_links:
                request = Request(internal_link, callback=self.parse_seed)
                request.meta['item'] = item
                yield request
        else:
            yield item
Another thing you could do is create an extension that hooks into the spider_closed signal, so you can do whatever you need once you know the spider has finished.
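A minimal sketch of that second option, connecting the spider_closed signal from within the spider itself rather than a standalone extension (collected_items is a hypothetical attribute where parse_seed would stash finished items):

import scrapy
from scrapy import signals


class WebsiteSpider(scrapy.Spider):
    name = "web"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # call spider_closed() once the crawl has fully finished
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # at this point no more requests will be scheduled, so every item is complete
        self.logger.info("Collected %d items", len(getattr(self, 'collected_items', [])))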

Django-MPTT: how to make the full path to child pages?

I've started using the Django-MPTT app to get a tree-based approach to the pages of my Django site.
For example, I have pages with sub-pages:
Trance:
    Vocal Trance (sub page)
    Hard Trance (sub page)
Breaks:
    Atmo Breaks (sub page)
    Progressive Breaks (sub page)
How can I get access to them from urls.py?
What pattern will help?
Do I need to store Full_path in model or it can be done via url pattern?
I assume you mean you want to do URLs like this:
/trance/
/trance/vocal-trance/
/trance/hard-trace/
/breaks/
/breaks/atmo-breaks/
/breaks/progressive-breaks/
If so, it's probably best to store the url fragment in your model. Something like:
from mptt.models import MPTTModel
from django.db import models
from django.template.defaultfilters import slugify


class Page(MPTTModel):
    name = models.CharField(max_length=50)
    slug = models.CharField(max_length=50, null=True)
    url = models.CharField(max_length=255, null=True)

    def save(self, *args, **kwargs):
        if self.slug is None:
            # create a slug that's unique to siblings
            slug = slugify(self.name)
            self.slug = slug
            siblings = self.get_siblings()
            i = 1
            while siblings.filter(slug=self.slug).exists():
                i += 1
                self.slug = slug + '-%d' % i
        # now create a URL based on the parent's url + slug
        if self.parent:
            self.url = '%s/%s' % (self.parent.url, self.slug)
        else:
            self.url = self.slug
        super(Page, self).save(*args, **kwargs)
Then add a URL pattern:
(r'^pages/(?P<page_url>[\w\d_/-]+)/$', 'pages.views.show_page'),
And in your view you can just fetch the right page:
def show_page(request, page_url=None):
    page = get_object_or_404(Page, url=page_url)
    ...
Thank you for your attention to my problem. Here's how I finally did it.
models.py
class WebPage(MPTTModel):
    slug = RuSlugField(max_length=20, unique=True)
    title = models.CharField(max_length=50)
    content = models.TextField()
    parent = TreeForeignKey('self', null=True, blank=True, related_name='children')

    class MPTTMeta:
        order_insertion_by = ['slug']

    def get_absolute_url(self):  # TODO: replace with get_ancestors
        url = "/%s/" % self.slug
        page = self
        while page.parent:
            url = "/%s%s" % (page.parent.slug, url)
            page = page.parent
        return url
urls.py
urlpatterns = patterns('website.views',
    url(r"^add/$", "add_page", name="add"),
    url(r"^(?P<full_slug>.*)/add/$", "add_page", name="add"),
    url(r"^(?P<full_slug>.*)/edit/$", "edit_page", name="edit"),
    url(r'^$', ListView.as_view(model=WebPage, template_name='index.html', context_object_name="webpages_list"), name='index'),
    url(r"^(?P<full_slug>.*)/$", "page", name="page"),
)
views.py
def page(request, full_slug):
    # Make a list from full_slug.
    # For ex. /trance/progressive_trance/fonarev -> ['trance', 'progressive_trance', 'fonarev']
    slugs = full_slug.split('/')
    page = None
    # Get a page by its slug
    if len(slugs) > 1:
        page = get_object_or_404(WebPage, slug=slugs[-1])  # slugs=['trance', 'vocal_trance'] -> 'vocal_trance'
    elif len(slugs) == 1:
        page = get_object_or_404(WebPage, slug=slugs[0])  # slugs=['trance'] -> 'trance'
    # Check if the page url matches the requested full_slug
    if page.get_absolute_url().strip('/') == full_slug:
        return render_to_response('page.html', {'page': page}, context_instance=RequestContext(request))
    else:
        raise Http404


def edit_page(request, full_slug):
    slugs = full_slug.split('/')
    page = None
    if len(slugs) > 1:
        page = get_object_or_404(WebPage, slug=slugs[-1])
    elif len(slugs) == 1:
        page = get_object_or_404(WebPage, slug=slugs[0])
    if not page.get_absolute_url().strip('/') == full_slug:
        raise Http404
    # POST: update the existing page
    if request.method == 'POST':
        form = WebPageForm(request.POST, instance=page)
        if form.is_valid():
            form.save()
            return HttpResponseRedirect(page.get_absolute_url())
    # Otherwise render a form to edit the existing page
    else:
        form = WebPageForm(instance=page)
    return render_to_response('edit_page.html', {'form': form}, context_instance=RequestContext(request))


def add_page(request, full_slug=None):
    parent_page = None
    slug = None
    if full_slug:
        slug = full_slug.split('/')
    # If there is a slug in the request (e.g. 'trance') -> we need to add the new page to its parent page.
    # So get the parent page.
    if slug:
        if len(slug) > 1:
            parent_page = get_object_or_404(WebPage, slug=slug[-1])
        elif len(slug) == 1:
            parent_page = get_object_or_404(WebPage, slug=slug[0])
    # Create the new page
    if request.method == 'POST':
        form = WebPageForm(request.POST)
        if form.is_valid():
            new_page = form.save(commit=False)
            if parent_page:
                new_page.parent = parent_page
            new_page.save()
            return HttpResponseRedirect(new_page.get_absolute_url())
    # Return an unbound form
    else:
        form = WebPageForm()
    return render_to_response('add_page.html', {'form': form}, context_instance=RequestContext(request))
The trick is that we have to check that the page really exists when it is accessed via full_slug:
if not page.get_absolute_url().strip('/') == full_slug:
    raise Http404
Otherwise the check would be wrong, since looking a page up by its slug alone is not enough.
There's also a django app that will do the work for you: django-mptt-urls
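And, as the TODO in the model above hints, get_absolute_url can also be built from get_ancestors instead of walking the parents manually: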
def get_absolute_url(self):
    return '/'.join([x['slug'] for x in self.get_ancestors(include_self=True).values()])
