I am trying to scrape news articles from skynewsarabia.com.
import json

import scrapy


class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    sport = "https://www.skynewsarabia.com/sport/"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ["article_content", "tags"],
    }
    allowed_domains = ['www.skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]
    urls = []

    def parse(self, response):
        articles = json.loads(response.text)
        # to get the link for each article we need to combine both the id and the urlFriendlySuffix in one link
        for article in range(0, len(articles["contentItems"])):
            article_id = articles["contentItems"][article]["id"]
            article_url = articles["contentItems"][article]["urlFriendlySuffix"]
            relative_link = article_id + "-" + article_url
            full_link = self.sport + relative_link
            self.urls.append(full_link)
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        self.urls = []
        print("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            print("I am inside!")
            print(next_page)
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        pass
The basic idea here is that the first request returns 20 article links plus a nextPageToken, which you have to insert into the next URL so you can scrape the next 20 links. However, the problem I am facing is that when I run the script, it takes the next token, gets all the links for that token, and then stops, so I am scraping only 20 links. When I print first_token it shows something different from the 1569266773000 that is provided by default in the script.
You need to change allowed_domains = ['www.skynewsarabia.com'] to allowed_domains = ['skynewsarabia.com']. Alternatively, remove the allowed_domains variable completely.
Since you have specified the hostname www, Scrapy filters the requests to api.skynewsarabia.com as offsite and those calls are simply dropped.
Additional tip: use self.logger.info and self.logger.debug instead of the print calls in your code.
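A minimal sketch of how those two suggestions could look (only the changed lines matter; the link-building inside parse stays exactly as posted above):

import json

import scrapy


class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    # Bare domain: requests to api.skynewsarabia.com are no longer filtered as offsite.
    allowed_domains = ['skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]

    def parse(self, response):
        articles = json.loads(response.text)
        # ... build and yield the article requests exactly as in the original parse ...
        self.logger.debug("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            self.logger.info("Following next page: %s", next_page)
            yield response.follow(url=next_page, callback=self.parse)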
Hi there, I built a scraper using the Scrapy framework. It works on the first page perfectly, but it fails to get the same data from the next pages, even after I wrote code to crawl the next page. What am I getting wrong in my code? My items.py file is working fine too.
Here's my code
import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()
        next_page = response.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[52]/div/div/span/a/@href').get()
        if next_page is not None:
            next_page_url = 'https://www.amazon.com' + next_page
            yield response.follow(next_page_url, callback=self.parse)
Here's my AmazonscraperItem:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags


class AmazonscraperItem(scrapy.Item):
    name = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    review = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
I have fixed the issue; there were a couple of technical errors in the code. First, I updated the next-page selector so it picks up the correct URL. Second, you don't need to prepend the base URL when sending the request, because response.follow automatically converts a relative URL into an absolute one. The code below works across multiple pages (the full pagination).
import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        products = response.css('div.sg-col-4-of-12')
        for product in products:
            l = ItemLoader(item=AmazonscraperItem(), selector=product)
            l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            l.add_css('price', 'span.a-price span.a-offscreen')
            l.add_css('review', 'i.a-icon span.a-icon-alt')
            yield l.load_item()
        next_page = response.css('.s-pagination-next ::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
I want to get the URL of the next page of this site:
https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=4&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
Is there a way?
I've tried several approaches, but in vain.
import json

import scrapy
from scrapy_splash import SplashRequest
import splash


class QuotesSpider(scrapy.Spider):
    name = "Spider"
    start_urls = [
        'https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955'
    ]
    splash.private_mode_enabled = False

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, formdata={'modelStr': json.dumps({'pageSize': 100})}, callback=self.parse, args={'wait': 6})

    def parse(self, response):
        links = response.css('span.ng-scope>a::attr(href)').extract()
        urls = ['https://cadres.apec.fr' + link for link in links]
        for url in urls:
            yield SplashRequest(url=url, callback=self.parse_details, args={'wait': 8, 'private_mode_enabled': False})

    def parse_details(self, response):
        post = response.css('h1.text-uppercase::text').get()
        salary = response.css('div.col-md-6>p::text')[0].extract()
        name = response.css('p.margin-bottom-0 > strong::text').get()
        reference = response.css('p.margin-bottom-5::text').get()
        capturepost = response.css('div.col-md-6>p::text')[1].extract()
        experience = response.css('div.col-md-6>p::text')[2].extract()
        job_status = response.css('div.col-md-6>p::text')[3].extract()
        profile = response.css('[id="profil-recherche"]>p::text').extract()
        company = response.css('[id="entreprise"]>p::text').extract()
        company_1 = '\n'.join(company)
        description = response.css('[id="descriptif-du-poste"]>p::text').extract()
        des = '\n'.join(description)
        item = {"Name": name, 'Salary': salary, 'Post': post, 'Reference': reference, 'Experience': experience,
                'Job Status': job_status, 'Profile': profile, 'Company': company_1, 'Capture of Post': capturepost,
                'Description': des}
        yield item
How can I get the URL behind the javascript:void link?
Try to find the total number of pages and format the page number in the URL accordingly.
URL = https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
Replace page=1 with a page variable and iterate up to the total number of items divided by 20 items per page (the page count).
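A minimal sketch of that idea as a start_requests method for the spider above (total_items is a placeholder; in practice you would read the real result count from the first response):

def start_requests(self):
    base_url = (
        "https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/"
        "liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING"
        "&nbParPage=20&page={page}&lieux=590711&motsCles=commercial"
        "&latitude=48.862903&longitude=2.335955"
    )
    total_items = 200                                  # assumption: take this from the site's result counter
    items_per_page = 20
    page_count = -(-total_items // items_per_page)     # ceiling division
    for page in range(1, page_count + 1):
        # Format the page number into the URL and request each page in turn.
        yield SplashRequest(url=base_url.format(page=page),
                            callback=self.parse, args={'wait': 6})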
Hi, I am having a problem with my spider script. I want to keep the script as readable as possible and avoid repeating code. Is it possible to use the same parse method for different URLs?
I want to scrape only 10 items per page and save them in different item classes in items.py.
Here's my code
def start_requests(self):  # I have 3 URLs here
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/6/posts', self.parse)  # URL 1
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/7/posts', self.parse)  # URL 2
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/1/posts', self.parse)  # URL 3

def parse(self, response):  # My logic is something like this
    if Url == Url1:
        item = TmcnfSpiderItem()
    elif Url == Url2:
        item = TmcnfSpiderItem2()
    elif Url == Url3:
        item = TmcnfSpiderItem3()
    if count <= 9:
        count += 1
        info = response.css("[id^='fc-post-" + postno_only + "']")
        author = info.xpath("@data-author").extract_first()
        item['author'] = author
        yield item
    else:
        # Move to next URL and perform the same parse
        pass
Any idea?
I think you can try to pass all the needed data from start_requests, like this:
def start_requests(self):
    urls = (
        ('https://teslamotorsclub.com/tmc/post-ratings/6/posts', TmcnfSpiderItem),
        ('https://teslamotorsclub.com/tmc/post-ratings/7/posts', TmcnfSpiderItem2),
        ('https://teslamotorsclub.com/tmc/post-ratings/1/posts', TmcnfSpiderItem3),
    )
    for url, itemclass in urls:
        yield scrapy.Request(url, meta={'itemclass': itemclass})

def parse(self, response):
    item = response.meta['itemclass']()
So you pass the item class for each URL, and in the parse method you create a new instance of that class.
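A small sketch of how parse could continue from there, reusing the selector from the question (the fc-post- id prefix, the data-author attribute, and the 10-items-per-page cap are assumptions carried over from the original snippet):

def parse(self, response):
    itemclass = response.meta['itemclass']
    # Take at most 10 posts per page, as in the original logic.
    for post in response.css("[id^='fc-post-']")[:10]:
        item = itemclass()
        item['author'] = post.xpath("@data-author").extract_first()
        yield item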
I'm building a scraper to crawl a page and return multiple items (h3 and p tags) from within a div. For some reason, the scraper prints all the 'name' fields when called, but it only saves the info for the last item on the page.
Here's my code:
import scrapy


class FoodSpider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['https://blog.feedspot.com/food_blogs/']
    start_urls = ['https://blog.feedspot.com/food_blogs/']

    def parse(self, response):
        blogs = response.xpath("//div[@class='fsb v4']")
        for blog in blogs:
            names = blog.xpath('.//h3/a[@class="tlink"]/text()'[0:]).extract()
            links = blog.xpath('.//p/a[@class="ext"]/@href'[0:]).extract()
            locations = blog.xpath('.//p/span[@class="location"]/text()'[0:]).extract()
            abouts = blog.xpath('.//p[@class="trow trow-wrap"]/text()[4]'[0:]).extract()
            post_freqs = blog.xpath('.//p[@class="trow trow-wrap"]/text()[6]'[0:]).extract()
            networks = blog.xpath('.//p[@class="trow trow-wrap"]/text()[9]'[0:]).extract()
            for name in names:
                name.split(',')
                # print(name)
            for link in links:
                link.split(',')
            for location in locations:
                location.split(',')
            for about in abouts:
                about.split(',')
            for post_freq in post_freqs:
                post_freq.split(',')
            for network in networks:
                network.split(',')
            yield {'name': name,
                   'link': link,
                   'location': location,
                   'about': about,
                   'post_freq': post_freq,
                   'network': network
                   }
Does anyone have an idea of what I'm doing wrong?
If you run //div[@class='fsb v4'] in DevTools, it only returns a single element.
So you have to find a selector that matches each of those profile divs:
class FoodSpider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['https://blog.feedspot.com/food_blogs/']
    start_urls = ['https://blog.feedspot.com/food_blogs/']

    def parse(self, response):
        for blog in response.css("p.trow.trow-wrap"):
            yield {'name': blog.css(".thumb.alignnone::attr(alt)").extract_first(),
                   'link': "https://www.feedspot.com/?followfeedid=%s" % blog.css("::attr(data)").extract_first(),
                   'location': blog.css(".location::text").extract_first(),
                   }
I am scraping a website that gives every request session a sid. After getting the sid, I perform further search queries with it and scrape the results.
I want to change the sid every time I finish scraping all the results of a single query; I've tried clearing the cookies, but it doesn't work.
However, if I restart my crawler, it gets a different sid each time. I just don't know how to get a new sid without restarting the crawler.
I am wondering if there is something else that lets the server know two requests come from the same connection.
Thanks!
Here is my current code:
import re

import scrapy
from scrapy import FormRequest, Request


class MySpider(scrapy.Spider):
    name = 'my_spider'
    allowed_domains = ['xxx.com']
    start_urls = ['http://xxx/']
    sid_pattern = r'SID=(\w+)&'
    SID = None
    query_list = ['aaa', 'bbb', 'ccc']
    i = 0

    def parse(self, response):
        if self.i >= len(self.query_list):
            return
        pattern = re.compile(self.sid_pattern)
        result = re.search(pattern, response.url)
        if result is not None:
            self.SID = result.group(1)
        else:
            exit(-1)
        search_url = 'http://xxxx/AdvancedSearch.do'
        query = self.query_list[self.i]
        self.i += 1
        query_form = {
            'aaa': 'bbb'
        }
        yield FormRequest(search_url, method='POST', formdata=query_form, dont_filter=True,
                          callback=self.parse_result)
        yield Request(self.start_urls[0], cookies={}, callback=self.parse, dont_filter=True)

    def parse_result(self, response):
        # do something
        pass
Setting COOKIES_ENABLED = False can achieve this, but is there another way that does not rely on a global setting?
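One request-level alternative (a sketch only, not tested against this site) is Scrapy's cookiejar request meta key, which keeps separate cookie sessions within a single crawl; handing each query a fresh jar id should make the server issue a new sid:

# Hypothetical: use a new cookiejar id per query so the next round starts with empty cookies.
yield Request(self.start_urls[0],
              meta={'cookiejar': self.i},   # self.i changes per query in the spider above
              callback=self.parse,
              dont_filter=True)

Any follow-up requests belonging to the same query would need to carry the same meta={'cookiejar': ...} value so they share that session.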