I'm trying to pull some info off of Craigslist and store it in a JSON file, but the info is getting stored a bit wrong. Instead of having an array of [title, link, location, time], I'm getting an array with all the titles, one with all the links, etc. Are my titles wrong or is the for loop itself wrong?
from scrapy.spiders import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(BaseSpider):
name = "craig"
allowed_domains = ["craigslist.org"]
start_urls = ["https://pittsburgh.craigslist.org/search/ccc"]
def parse(self, response):
titles = response.selector.xpath("//p[#class='row']")
items = []
for titles in titles:
item = CraigslistSampleItem()
item["title"] = titles.xpath("//span[#id='titletextonly']").extract()
item["link"] = titles.xpath("a/#href").extract()
item["location"] = titles.xpath("//small").extract()
item["time"] = titles.xpath('//time').extract()
items.append(item)
return items
That's because your inner xpaths match the elements starting from the root of the tree. Instead, you need to force them to work in the context of each item by prepending a dot:
for title in titles:
item = CraigslistSampleItem()
item["title"] = title.xpath(".//span[#id='titletextonly']").extract()
item["link"] = title.xpath("a/#href").extract()
item["location"] = title.xpath(".//small").extract()
item["time"] = title.xpath('.//time').extract()
yield item
Related
Hi there I built a scraper using scrapy framework, it works on the first page perfectly but fails to get same data from next pages even after writing a code to crawl from next page. What am I getting wrong in my code. My items.py file is working fine too.
Here's my code
import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader
class AmazonspiderSpider(scrapy.Spider):
name = 'amazonspider'
allowed_domains = ['amazon.com']
start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']
def parse(self, response):
products = response.css('div.sg-col-4-of-12')
for product in products:
l = ItemLoader(item = AmazonscraperItem(), selector = product )
l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
l.add_css('price', 'span.a-price span.a-offscreen')
l.add_css('review', 'i.a-icon span.a-icon-alt')
yield l.load_item()
next_page = response.xpath('//*[#id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[52]/div/div/span/a/#href').get()
if next_page is not None:
next_page_url = 'https://www.amazon.com' + next_page
yield response.follow(next_page_url, callback = self.parse)
Here's my AmazonScraperItem
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
class AmazonscraperItem(scrapy.Item):
name = scrapy.Field(input_processor = MapCompose(remove_tags), output_processor = TakeFirst())
price = scrapy.Field(input_processor = MapCompose(remove_tags), output_processor = TakeFirst())
review = scrapy.Field(input_processor = MapCompose(remove_tags), output_processor = TakeFirst())
I have fixed the issue. There was a technical error with the code. I have updated a few things. I have updated the next page selector to get the correct URL. Secondly, we don't need to append any URL while sending a request as you are using response.follow. response.follow will automatically convert the relative URL into an absolute URL. The below code is working for multiple pages (all pagination).
class AmazonspiderSpider(scrapy.Spider):
name = 'amazonspider'
allowed_domains = ['amazon.com']
start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']
def parse(self, response):
products = response.css('div.sg-col-4-of-12')
for product in products:
l = ItemLoader(item = AmazonscraperItem(), selector = product )
l.add_css('name', 'a.a-link-normal span.a-size-base-plus')
l.add_css('price', 'span.a-price span.a-offscreen')
l.add_css('review', 'i.a-icon span.a-icon-alt')
yield l.load_item()
next_page = response.css('.s-pagination-next ::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback = self.parse)
I want to get the url of next page of this site:
https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=4&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
is there a way?
I've tried some ways, but in vain.
import scrapy
from scrapy_splash import SplashRequest
import splash
class QuotesSpider(scrapy.Spider):
name = "Spider"
start_urls = [
'https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955'
]
splash.private_mode_enabled = False
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, formdata= {'modelStr': json.dumps({'pageSize': 100})},callback=self.parse,args={'wait': 6})
def parse(self, response):
links = response.css('span.ng-scope>a::attr(href)').extract()
urll = ['https://cadres.apec.fr' + link for link in links]
urls = urll
for url in urls:
yield SplashRequest(url=url, callback=self.parse_details,args={'wait': 8, 'private_mode_enabled': False})
def parse_details(self, response):
post = response.css('h1.text-uppercase::text').get()
salary = response.css('div.col-md-6>p::text')[0].extract()
name = response.css('p.margin-bottom-0 > strong::text').get()
reference = response.css('p.margin-bottom-5::text').get()
capturepost = response.css('div.col-md-6>p::text')[1].extract()
experience = response.css('div.col-md-6>p::text')[2].extract()
job_status = response.css('div.col-md-6>p::text')[3].extract()
profile = response.css('[id="profil-recherche"]>p::text').extract()
company = response.css('[id="entreprise"]>p::text').extract()
company_1 = '\n'.join(company)
description = response.css('[id="descriptif-du-poste"]>p::text').extract()
des = '\n'.join(description)
list = {"Name": name, 'Salary': salary, 'Post': post, 'Reference': reference, 'Experience': experience,
'Job Status': job_status, 'Profile': profile, 'Company': company_1, 'Capture of Post': capturepost,
'Description': des}
yield list
How can i get the javascript.void url?
Try to find the total number of pages and format the page number in the URL accordingly.
URL = https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi.html?sortsType=SCORE&sortsDirection=DESCENDING&nbParPage=20&page=1&lieux=590711&motsCles=commercial&latitude=48.862903&longitude=2.335955
change page=1 with variable pages and iterate over the total number of items divided by 20 items per page(page count).
I am doing web-scraping using beautifulsoup in python 3.7. The code below is successfully scraping date, title, tags but not the content of the articles. It is giving None instead.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
pages = 32
for page in range(4, pages+1):
res = requests.get(url.format(page))
soup = BeautifulSoup(res.text,"lxml")
for item in soup.find_all("a", {"class": "story-card75x1-text"}, href=True):
_href = item.get("href")
try:
resp = requests.get(_href)
except Exception as e:
try:
resp = requests.get("https://www.thehindu.com"+_href)
except Exception as e:
continue
dateTag = soup.find("span", {"class": "dateline"})
sauce = BeautifulSoup(resp.text,"lxml")
tag = sauce.find("a", {"class": "section-name"})
titleTag = sauce.find("h1", {"class": "title"})
contentTag = sauce.find("div", {"class": "_yeti_done"})
date = None
tagName = None
title = None
content = None
if isinstance(dateTag,Tag):
date = dateTag.get_text().strip()
if isinstance(tag,Tag):
tagName = tag.get_text().strip()
if isinstance(titleTag,Tag):
title = titleTag.get_text().strip()
if isinstance(contentTag,Tag):
content = contentTag.get_text().strip()
print(f'{date}\n {tagName}\n {title}\n {content}\n')
time.sleep(3)
I don't see where is the problem as I am writing the correct class in contentTag.
Thanks.
I guess the links you would like to follow from first page to it's inner page end with .ece. I've applied that logic within the script to traverse those target pages to scrape data from. I've defined selectors for content slightly differently. Now it appears to be working correctly. The following script only scrapes data from page 1. Feel free to change it as per your requirement.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page=1'
base = "https://www.thehindu.com"
res = requests.get(url)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select(".story-card-news a[href$='.ece']"):
resp = requests.get(urljoin(base,item.get("href")))
sauce = BeautifulSoup(resp.text,"lxml")
title = item.get_text(strip=True)
content = ' '.join([item.get_text(strip=True) for item in sauce.select("[id^='content-body-'] p")])
print(f'{title}\n {content}\n')
Below is the code along with url of the site I m trying to get data from, but programs runs writing nothing to csv output file. Also, I got the xpath from chrome inspector console, so must be correct.
Here is the code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from insta.items import Insta
class instagram(CrawlSpider):
name = "instagram"
allowed_domains = ["zymanga.com"]
start_urls = ['http://zymanga.com/millionplus/%sf' % page for page in range(1,163)]
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.xpath('//*[#id="username"]/')
items = []
for title in titles:
item = Insta()
item["username"] = title.select("a/text()").extract()
items.append(item)
return(items)
Im trying to scrape a list of all the hotels in San Francisco from:
http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html
the "Next Hotels" has unique urls:
page 2 is: /Hotels-g60713-oa30-San_Francisco_California-Hotels.html
page 3 is: /Hotels-g60713-oa60-San_Francisco_California-Hotels.html
page 4 is: /Hotels-g60713-oa90-San_Francisco_California-Hotels.html
and so on..
How can I set the crawlSpider to reach these pages
Is there a Rule that can help me in this case?
Is there a way to prioritize and make it scrape and parse these pages before anything else?
my code so far:
import beatSoup_test
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class TriAdvSpider(CrawlSpider):
name = "tripAdv"
allowed_domains = ["tripadvisor.com"]
start_urls = [
"http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html"
]
rules = (
Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
)
def parse_item(self, response):
beatSoup_test.getHotels(response.body_as_unicode())
where beatSoup_test is my parsing function that uses beautifulsoup.
Thanks!
If you want to scrape the data from any page. Use Xpath
That way you can scrape anything at same page.
And Use items to store the scraped data so that u can scrape as many thing you want.
Here is example how you can use it.
sites = Selector(text=response.body).xpath('//div[contains(#id, "identity")]//section/div/div/h3/a/text()')
items = []
items = myspiderBotItem()
items['title'] = sites.xpath('/text()').extract()
Like this
class TriAdvSpider(CrawlSpider):
name = "tripAdv"
allowed_domains = ["tripadvisor.com"]
start_urls = [
"http://www.tripadvisor.com/Hotels-g60713-San_Francisco_California-Hotels.html"
]
rules = (
Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
)
def parse_item(self, response):
# beatSoup_test.getHotels(response.body_as_unicode())
l = XPathItemLoader(item = TriAdvItem(),response = response)
for i in range(1,8):
l.add_xpath('day','//*[#id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr['+str(i)+']/th[#scope="row"]/text()')
l.add_xpath('timings1','//*[#id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr['+str(i)+']/td[1]/span[1]/text()')
l.add_xpath('timings2','//*[#id="super-container"]/div/div[1]/div[2]/div[2]/div[1]/table/tbody/tr['+str(i)+']/td[1]/span[2]/text()')
return l.load_item()