select("a/text()").extract() does not work - web-scraping

I am trying to get the text inside of an anchor (href) tag. Basically, I am trying to scrape Android bugs at https://code.google.com/p/android/issues/list
<td class="vt col_4" width="100%" onclick="if (!cancelBubble) _goIssue(0)">
  <a onclick="cancelBubble=true" href="../../android/issues/detail?id=58866&colspec=ID Type Status Owner Summary Stars">
    compass not showing right direktion
  </a>
</td>
This is my code:
class MySpider(BaseSpider):
    name = "craig"
    start_urls = ["https://code.google.com/p/android/issues/list"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//td[@class='vt col_4']")
        items = []
        for titles in titles:
            item = CraiglistSampleItem()
            item["id"] = titles.select("a/text()").extract()
            item["type"] = titles.select("a/@href").extract()
            items.append(item)
        return items
I tested it out on other hrefs and it works fine. Does anyone know why this won't work on the href that shows the bug summary above? Thanks!

Your iteration variable has the same name as the variable that you're iterating over, which isn't a good idea. Also, you have to select every other row:
class MySpider(BaseSpider):
    name = "craig"
    start_urls = ["https://code.google.com/p/android/issues/list"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        table = hxs.select("//table[@id='resultstable']")
        for title in table.select("tr/td[@class='vt col_4'][2]"):
            item = CraiglistSampleItem()
            item["id"] = title.select("a/text()").extract()
            item["type"] = title.select("a/@href").extract()
            yield item
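As a side note, BaseSpider and HtmlXPathSelector are long deprecated; on a current Scrapy the same extraction can be sketched with scrapy.Spider and response.xpath(). This is only an illustrative modernisation of the answer above; the spider name and the plain dict it yields are assumptions, not the original item class:

import scrapy


class AndroidIssuesSpider(scrapy.Spider):
    # Illustrative rewrite of the accepted answer using the current API.
    name = "android_issues"
    start_urls = ["https://code.google.com/p/android/issues/list"]

    def parse(self, response):
        # response.xpath() replaces HtmlXPathSelector.select() in current Scrapy.
        rows = response.xpath("//table[@id='resultstable']//td[@class='vt col_4'][2]")
        for title in rows:
            yield {
                "id": title.xpath("a/text()").getall(),
                "type": title.xpath("a/@href").getall(),
            }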

Related

Scrapy isn't scraping the next page

I am trying to scrape news articles from skynewsarabia.com
import json

import scrapy


class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    sport = "https://www.skynewsarabia.com/sport/"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ["article_content", "tags"],
    }
    allowed_domains = ['www.skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]
    urls = []

    def parse(self, response):
        articles = json.loads(response.text)
        # to get the link for each article we need to combine both the id and the urlFriendlySuffix in one link
        for article in range(0, len(articles["contentItems"])):
            article_id = articles["contentItems"][article]["id"]
            article_url = articles["contentItems"][article]["urlFriendlySuffix"]
            relative_link = article_id + "-" + article_url
            full_link = self.sport + relative_link
            self.urls.append(full_link)
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        self.urls = []
        print("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            print("I am inside!")
            print(next_page)
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        pass
The basic idea here is that the first request returns 20 article links plus a token for the next page, which you have to insert into the next URL in order to scrape the following 20 links. However, the problem I am facing is that when I run the script, it takes the next token, gets all the links for that token, and then stops, so I am only scraping 20 links. When I print first_token it shows something different from 1569266773000, which is the default provided in the script.
You need to change allowed_domains = ['www.skynewsarabia.com'] to allowed_domains = ['skynewsarabia.com']. Alternatively, remove the allowed_domains attribute completely.
Since you specified the www hostname, Scrapy's offsite middleware filters the requests to api.skynewsarabia.com as offsite and simply drops them.
Additional tip: use self.logger.info and self.logger.debug instead of the print calls in your code.
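A minimal sketch of the fix, keeping only the attributes that matter here; the parsing logic from the question is unchanged and omitted, and the debug line is just an illustrative replacement for the prints:

import scrapy


class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    # The bare registered domain also matches api.skynewsarabia.com, so the
    # offsite middleware no longer drops the follow-up API requests.
    allowed_domains = ['skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]

    def parse(self, response):
        # Prefer the spider's logger over print(); output then respects LOG_LEVEL.
        self.logger.debug("Fetched %s (%d bytes)", response.url, len(response.body))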

How can I grab a link from javascript:void(0) using Scrapy with Splash?

I want to get the URL of the next page of this site:
https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=4&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
Is there a way?
I've tried some ways, but in vain.
import json

import scrapy
from scrapy_splash import SplashRequest
import splash


class QuotesSpider(scrapy.Spider):
    name = "Spider"
    start_urls = [
        'https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955'
    ]
    splash.private_mode_enabled = False

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, formdata={'modelStr': json.dumps({'pageSize': 100})},
                                callback=self.parse, args={'wait': 6})

    def parse(self, response):
        links = response.css('span.ng-scope>a::attr(href)').extract()
        urls = ['https://cadres.apec.fr' + link for link in links]
        for url in urls:
            yield SplashRequest(url=url, callback=self.parse_details,
                                args={'wait': 8, 'private_mode_enabled': False})

    def parse_details(self, response):
        post = response.css('h1.text-uppercase::text').get()
        salary = response.css('div.col-md-6>p::text')[0].extract()
        name = response.css('p.margin-bottom-0 > strong::text').get()
        reference = response.css('p.margin-bottom-5::text').get()
        capturepost = response.css('div.col-md-6>p::text')[1].extract()
        experience = response.css('div.col-md-6>p::text')[2].extract()
        job_status = response.css('div.col-md-6>p::text')[3].extract()
        profile = response.css('[id="profil-recherche"]>p::text').extract()
        company = response.css('[id="entreprise"]>p::text').extract()
        company_1 = '\n'.join(company)
        description = response.css('[id="descriptif-du-poste"]>p::text').extract()
        des = '\n'.join(description)
        item = {"Name": name, 'Salary': salary, 'Post': post, 'Reference': reference,
                'Experience': experience, 'Job Status': job_status, 'Profile': profile,
                'Company': company_1, 'Capture of Post': capturepost,
                'Description': des}
        yield item
How can I get the javascript:void(0) URL?
Try to find the total number of pages and format the page number in the URL accordingly.
URL = https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
Replace page=1 with a page variable and iterate from 1 up to the total number of items divided by 20 items per page (the page count).
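A minimal sketch of that approach; total_results is a placeholder value that you would in practice read from the results page:

# Sketch: iterate over page numbers by formatting them into the search URL.
BASE_URL = (
    "https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/"
    "liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING"
    "&nbParPage=20&page={page}&lieux=590711&motsCles=commercial"
    "&latitude=48.862903&longitude=2.335955"
)

total_results = 400               # placeholder; read the real count from the page
per_page = 20
page_count = -(-total_results // per_page)   # ceiling division

for page in range(1, page_count + 1):
    url = BASE_URL.format(page=page)
    print(url)   # in the spider, each url would become a SplashRequest(url, ...)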

Scrapy spider only returning last item in list

I'm building a scraper to crawl a page and return multiple items (h3 & p tags) from within a div. For some reason, the scraper prints all the 'name' fields when called, but it only saves info for the last item on the page.
Here's my code:
import scrapy


class FoodSpider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['https://blog.feedspot.com/food_blogs/']
    start_urls = ['https://blog.feedspot.com/food_blogs/']

    def parse(self, response):
        blogs = response.xpath("//div[@class='fsb v4']")
        for blog in blogs:
            names = blog.xpath('.//h3/a[@class="tlink"]/text()'[0:]).extract()
            links = blog.xpath('.//p/a[@class="ext"]/@href'[0:]).extract()
            locations = blog.xpath('.//p/span[@class="location"]/text()'[0:]).extract()
            abouts = blog.xpath('.//p[@class="trow trow-wrap"]/text()[4]'[0:]).extract()
            post_freqs = blog.xpath('.//p[@class="trow trow-wrap"]/text()[6]'[0:]).extract()
            networks = blog.xpath('.//p[@class="trow trow-wrap"]/text()[9]'[0:]).extract()
            for name in names:
                name.split(',')
                # print(name)
            for link in links:
                link.split(',')
            for location in locations:
                location.split(',')
            for about in abouts:
                about.split(',')
            for post_freq in post_freqs:
                post_freq.split(',')
            for network in networks:
                network.split(',')
            yield {'name': name,
                   'link': link,
                   'location': location,
                   'about': about,
                   'post_freq': post_freq,
                   'network': network
                   }
Anyone have an idea on what I'm doing wrong?
If you run //div[@class='fsb v4'] in DevTools it only returns a single element, so your outer loop runs once and the single yield fires with whatever values the inner loop variables held last, which is why only the last item gets saved.
So you have to find a selector that matches each individual profile DIV:
class FoodSpider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['https://blog.feedspot.com/food_blogs/']
    start_urls = ['https://blog.feedspot.com/food_blogs/']

    def parse(self, response):
        for blog in response.css("p.trow.trow-wrap"):
            yield {'name': blog.css(".thumb.alignnone::attr(alt)").extract_first(),
                   'link': "https://www.feedspot.com/?followfeedid=%s" % blog.css("::attr(data)").extract_first(),
                   'location': blog.css(".location::text").extract_first(),
                   }
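With the yield inside the per-profile loop, every profile becomes its own item, and Scrapy's built-in feed export will save them all, e.g. scrapy crawl food -o food_blogs.json (the output filename is arbitrary).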

Trouble exporting Scrapy items to JSON

I'm trying to pull some info off of Craigslist and store it in a JSON file, but the info is getting stored a bit wrong. Instead of having an array of [title, link, location, time], I'm getting an array with all the titles, one with all the links, etc. Are my titles wrong or is the for loop itself wrong?
from scrapy.spiders import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from craigslist_sample.items import CraigslistSampleItem


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://pittsburgh.craigslist.org/search/ccc"]

    def parse(self, response):
        titles = response.selector.xpath("//p[@class='row']")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("//span[@id='titletextonly']").extract()
            item["link"] = titles.xpath("a/@href").extract()
            item["location"] = titles.xpath("//small").extract()
            item["time"] = titles.xpath('//time').extract()
            items.append(item)
        return items
That's because your inner xpaths match the elements starting from the root of the tree. Instead, you need to force them to work in the context of each item by prepending a dot:
for title in titles:
    item = CraigslistSampleItem()
    item["title"] = title.xpath(".//span[@id='titletextonly']").extract()
    item["link"] = title.xpath("a/@href").extract()
    item["location"] = title.xpath(".//small").extract()
    item["time"] = title.xpath('.//time').extract()
    yield item
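To see the difference in isolation, here is a small standalone sketch using parsel (the selector library that backs Scrapy's selectors); the HTML is made up purely for illustration:

from parsel import Selector

# Made-up HTML with two rows to illustrate absolute vs. context-relative XPath.
html = """
<div class="row"><small>first</small></div>
<div class="row"><small>second</small></div>
"""
sel = Selector(text=html)

for row in sel.xpath("//div[@class='row']"):
    # Absolute path: starts at the document root, so every row sees both <small> tags.
    print(row.xpath("//small/text()").getall())   # ['first', 'second'] both times
    # Relative path: the leading dot scopes the query to the current row only.
    print(row.xpath(".//small/text()").getall())  # ['first'], then ['second']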

When to return an Item if I don't know when the spider will finish?

So my spider takes in a list of websites and crawls each one via start_requests, which yields a Request with the item passed along as meta.
Then, the spider explores all the internal links of a single website and collects all the external links into the item. The problem is that I don't know when the spider finishes crawling all the internal links, so I can't yield an item.
import csv
from urllib.parse import urlparse

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor

# Links is the project's custom Item (fields: base_url, on_list) defined in items.py.


class WebsiteSpider(scrapy.Spider):
    name = "web"

    def start_requests(self):
        filename = "websites.csv"
        requests = []
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                header = next(reader)
                for row in reader:
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    request = Request(seed_url, callback=self.parse_seed)
                    request.meta['item'] = item
                    requests.append(request)
            return requests
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")

    def parse_seed(self, response):
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc
        external_le = LinkExtractor(deny_domains=netloc)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            item['on_list'].append(external_link)
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        for internal_link in internal_links:
            request = Request(internal_link, callback=self.parse_seed)
            request.meta['item'] = item
            yield request
The start_requests method can yield Request objects directly. You don't need to return a list of requests; just yield each Request when it is ready. This works because Scrapy requests are asynchronous.
The same goes for items: yield an item whenever you think it is ready. For your case I would recommend checking whether there are no more internal_links and yielding the item then. Alternatively, you can yield as many items as you want and afterwards check which one was the last (or the one with the most data):
class WebsiteSpider(scrapy.Spider):
    name = "web"

    def start_requests(self):
        filename = "websites.csv"
        try:
            with open(filename, 'r') as csv_file:
                reader = csv.reader(csv_file)
                header = next(reader)
                for row in reader:
                    seed_url = row[1].strip()
                    item = Links(base_url=seed_url, on_list=[])
                    yield Request(seed_url, callback=self.parse_seed, meta={'item': item})
        except IOError:
            raise scrapy.exceptions.CloseSpider("A list of websites are needed")

    def parse_seed(self, response):
        item = response.meta['item']
        netloc = urlparse(item['base_url']).netloc
        external_le = LinkExtractor(deny_domains=netloc)
        external_links = external_le.extract_links(response)
        for external_link in external_links:
            item['on_list'].append(external_link)
        internal_le = LinkExtractor(allow_domains=netloc)
        internal_links = internal_le.extract_links(response)
        if internal_links:
            for internal_link in internal_links:
                request = Request(internal_link, callback=self.parse_seed)
                request.meta['item'] = item
                yield request
        else:
            yield item
Another thing you could do is create an extension that hooks the spider_closed signal, so you can do whatever you need once you know the spider has finished.
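A minimal sketch of that idea, connecting a handler to the spider_closed signal from within the spider itself (a standalone extension would connect the same signal in its own from_crawler); the start URL and parse body are placeholders:

import scrapy
from scrapy import signals


class WebsiteSpider(scrapy.Spider):
    name = "web"
    start_urls = ["https://example.com"]  # placeholder seed

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Call spider_closed() once Scrapy has finished scheduling and crawling.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def parse(self, response):
        pass  # accumulate external links per site here

    def spider_closed(self, spider):
        # No more requests will run at this point, so any accumulated
        # per-site link lists are final and can be written out or logged.
        self.logger.info("Spider finished; post-process collected links here.")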
