Scrapy IndentationError: expected an indented block - web-scraping

Trust you are doing well. Please, I need your help: I'm getting this error but I don't know why:
File "C:\Users\Luis\Amazon\mercado\spiders\spider.py", line 14
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
^IndentationError: expected an indented block
# -*- coding: utf-8 -*-
import scrapy
import urllib
from mercado.items import MercadoItem
class MercadoSpider(CrawlSpider):
name = 'mercado'
item_count = 0
allowed_domain = ['https://www.amazon.es']
start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254']
def start_requests(self):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
for i in range(2,400):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page="+str(i)+"&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
def parse_item(self, response):
ml_item = MercadoItem()
#info de producto
ml_item['articulo'] = response.xpath('normalize-space(//*[@id="productTitle"])').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="priceblock_ourprice"])').extract()
self.item_count += 1
yield ml_item
Do you know why?
I've added the code here to make it easier to follow.

You have an indentation error: everything inside the class, and inside each method, has to be indented consistently. Here is the corrected version:
# -*- coding: utf-8 -*-
import scrapy
import urllib
from scrapy.spiders import CrawlSpider
from mercado.items import MercadoItem


class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['https://www.amazon.es']
    start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254']

    def start_requests(self):
        yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)
        for i in range(2, 400):
            yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=" + str(i) + "&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="productTitle"])').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="priceblock_ourprice"])').extract()
        self.item_count += 1
        yield ml_item
UPDATE: Right now you have code (not optimal) that iterates over the pagination and parses each URL as if it were a details page. You need to add a step that parses each pagination page and extracts the detail link for every item:
    def start_requests(self):
        yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254", self.parse_search)
        for i in range(2, 400):
            yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=" + str(i) + "&keywords=febi&ie=UTF8&qid=1535314254", self.parse_search)

    def parse_search(self, response):
        for item_link in response.xpath('//ul[@id="s-results-list-atf"]//a[contains(@class, "s-access-detail-page")]/@href').extract():
            yield scrapy.Request(item_link, self.parse_item)

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="productTitle"])').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="priceblock_ourprice"])').extract()
        self.item_count += 1
        yield ml_item
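Not part of the original answer, but once the indentation is fixed you can quickly check the result by launching the spider programmatically and exporting the scraped items. A minimal runner sketch, assuming it is executed from the Scrapy project root:

# Minimal runner sketch (assumption: run from the Scrapy project root).
# Equivalent to running "scrapy crawl mercado -o mercado.csv" on the command line.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'mercado', '-o', 'mercado.csv'])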

Related

How can I use Google Cloud Functions to run a web scraper?

I'm currently running a web scraper (this is the first time I've ever done something like this). It pulls addresses from the URL and then matches each address against the user's input. This will be going into a chat bot, and I'm wondering how I can make it run on Google Cloud Functions. What's the process for doing this? Is there a tutorial anywhere? Thanks in advance for your help.
This is my code so far. There is also a small items file.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy


class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None
        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)

            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio

            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]

        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)

    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()

        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried googling and having a look around, but I can't work out how to deploy this as a project so it runs on Google Cloud Functions. Or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution would be to wrap scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def my_cloud_function(event, context):

    def script(queue):
        try:
            settings = get_project_settings()

            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })

            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes

    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result

    return 'ok'
You can refer to this tutorial for more info.
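One caveat worth adding (not from the original answer): the spider above calls input(), and a Cloud Function has no interactive console, so the address to match would have to arrive with the request and be handed to the spider as an argument. A rough sketch of that idea for an HTTP-triggered function; the import path, the "address" payload field and the target_address attribute are assumptions, not names from the original code:

# Sketch only: pass the user's address into the spider instead of calling input().
# Assumes an HTTP-triggered function (Flask-style request object) and that the
# spider reads self.target_address inside parse() rather than calling input().
from multiprocessing import Process, Queue

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myproject.spiders.addresses import AddressesSpider  # assumed import path


def my_cloud_function(request):
    address = (request.get_json(silent=True) or {}).get("address", "")

    def script(queue):
        try:
            process = CrawlerProcess(get_project_settings())
            # Extra keyword arguments become attributes on the spider instance.
            process.crawl(AddressesSpider, target_address=address)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()
    child = Process(target=script, args=(queue,))
    child.start()        # run the crawl in a child process
    child.join()         # block until the spider finishes

    error = queue.get()  # check whether the child reported an error
    if error is not None:
        raise error
    return "ok"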

An error occurred while creating a bitcoin price notification Telegram chatbot (PytzUsageWarning)

import requests
import telegram
import json
from telegram.ext import Updater, CommandHandler
import time
import sys
import pandas as pd
from apscheduler.schedulers.background import BlockingScheduler
from apscheduler.jobstores.base import JobLookupError

dt = requests.get('https://min-api.cryptocompare.com/data/price?fsym=BTC&tsyms=USD,EUR')
print(dt.text)
price_now = dt.json()

bot_token = "5668522544:AAFqNFcgd5wDBtQbJBhRayfPx9VpVPVjcyQ"
Cointimeline = telegram.Bot(token=bot_token)

updates = Cointimeline.getUpdates()
for i in updates:
    print(i.message)


class Chatbot:
    def __init__(self, token):
        self.core = telegram.Bot(token)
        self.updater = Updater(token)
        self.id = 5734902861

    def sendmsg(self, text):
        self.core.send_message(chat_id=self.id, text=text)

    def stop(self):
        self.updater.stop()


class Alert(Chatbot):
    def __init__(self):
        self.token = '5668522544:AAFqNFcgd5wDBtQbJBhRayfPx9VpVPVjcyQ'
        Chatbot.__init__(self, self.token)
        self.updater.stop()

    def controller(self, cmd, func):
        self.updater.dispatcher.add_handler(CommandHandler(cmd, func))

    def start(self):
        self.sendmsg('')


aps = BlockingScheduler()


def push():
    dt = requests.get("https://min-api.cryptocompare.com/data/price?fsym=BTC&tsyms=USD,EUR")
    ALERTBOT = Alert()
    ALERTBOT.sendmsg(dt.text)
    price_now = pd.DataFrame({"USD": [list(dt.json().values())[0]], "EUR": [list(dt.json().values())[1]]})
    data = pd.read_csv("ALERTBOT.csv")
    data = data.append(price_now, sort=True)
    data = data.loc[:, 'USD':'EUR']
    data.to_csv("ALERTBOT.csv")


aps.add_job(push, 'interval', seconds=60)
aps.start()
The following warning keeps occurring:
PytzUsageWarning: The zone attribute is specific to pytz's interface; please migrate to a new time zone provider. For more details on how to do so, see https://pytz-deprecation-shim.readthedocs.io/en/latest/migration.html
  if obj.zone == 'local':
The error is caused by the pytz library being out of date. You can solve the problem by updating the library:
pip install --upgrade pytz
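If the warning still shows up after upgrading, you can also silence it while you migrate to a new time zone provider; a small stopgap sketch using only the standard library (place it near the top of the script, before the scheduler is created):

import warnings

# Stopgap (assumption: you just want a clean log while migrating): hide the
# pytz deprecation notice triggered by APScheduler's time zone handling.
warnings.filterwarnings("ignore", message=".*pytz.*")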

Prices webscraping using BeautifulSoup

Goal: I'm trying to scrape prices.
Expected output: two columns, 1) productName (OK), 2) price (not OK, I get NaN).
I tried the following:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import time
urllib3.disable_warnings()
t0 = time.time()
page_proximus = urlopen("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus, 'html.parser')
scrap_list=pd.DataFrame(columns =['Item_name','Item_price'])
url = 'https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html'+ str(page_list)
req = urllib3
res = req.request
soup = BeautifulSoup(page_proximus, 'html.parser')
html = urlopen('https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html').read().decode("utf-8")
bs = BeautifulSoup(html, 'html.parser')
scrap_name = bs.find_all(["h1"])
product_name=pd.DataFrame(scrap_name,columns =['Item_name'])
scrap_price = bs.find_all ("span",{'class': 'rs-unit'})
product_price=pd.DataFrame(scrap_price,columns =['Item_price'])
scrap_list=scrap_list.append(pd.concat([product_name['Item_name'], product_price['Item_price']],
axis=1))
t1 = time.time()
r=t1-t0
print(r)
print(scrap_list)
The data is within the <meta> tags.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
t0 = time.time()
page_proximus = requests.get("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus.text, 'html.parser')
rows = []
metaData = soup.find_all('meta',{'property':'og:description'})
for meta in metaData:
    row = {'Item_name': meta.find('meta', {'name': 'device_model'})['content'],
           'Item_price': meta.find('meta', {'name': 'device_price'})['content']}
    rows.append(row)
t1 = time.time()
r=t1-t0
print(r)
df = pd.DataFrame(rows)
print(df)
Output:
Item_name Item_price
0 iPhone 13 256GB Pink 1029,99

python unittest error and using patch from unittest.mock

I have two Python files. The first one is named employee.py, and this is its code:
import requests


class Employee:
    """A sample Employee class"""

    raise_amt = 1.05

    def __init__(self, first, last, pay):
        self.first = first
        self.last = last
        self.pay = pay

    @property
    def email(self):
        return '{}.{}@email.com'.format(self.first, self.last)

    @property
    def fullname(self):
        return '{} {}'.format(self.first, self.last)

    def apply_raise(self):
        self.pay = int(self.pay * self.raise_amt)

    def monthly_schedule(self, month):
        response = requests.get(f'http://company.com/{self.last}/{month}')
        if response.ok:
            return response.text
        else:
            return 'Bad Response!'
The other file is named test_employee.py, and this is its code:
import unittest
from unittest.mock import patch
from employee import Employee


class TestEmployee(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print('setupClass')

    @classmethod
    def tearDownClass(cls):
        print('teardownClass')

    def setUp(self):
        print('setUp')
        self.emp_1 = Employee('Corey', 'Schafer', 50000)
        self.emp_2 = Employee('Sue', 'Smith', 60000)

    def tearDown(self):
        print('tearDown\n')

    def test_email(self):
        print('test_email')
        self.assertEqual(self.emp_1.email, 'Corey.Schafer@email.com')
        self.assertEqual(self.emp_2.email, 'Sue.Smith@email.com')

        self.emp_1.first = 'John'
        self.emp_2.first = 'Jane'

        self.assertEqual(self.emp_1.email, 'John.Schafer@email.com')
        self.assertEqual(self.emp_2.email, 'Jane.Smith@email.com')

    def test_fullname(self):
        print('test_fullname')
        self.assertEqual(self.emp_1.fullname, 'Corey Schafer')
        self.assertEqual(self.emp_2.fullname, 'Sue Smith')

        self.emp_1.first = 'John'
        self.emp_2.first = 'Jane'

        self.assertEqual(self.emp_1.fullname, 'John Schafer')
        self.assertEqual(self.emp_2.fullname, 'Jane Smith')

    def test_apply_raise(self):
        print('test_apply_raise')
        self.emp_1.apply_raise()
        self.emp_2.apply_raise()

        self.assertEqual(self.emp_1.pay, 52500)
        self.assertEqual(self.emp_2.pay, 63000)

    def test_monthly_schedule(self):
        with patch('employee.requests.get') as mocked_get:
            mocked_get.return_value.ok = True
            mocked_get.return_value.text = 'Success'

            schedule = self.emp_1.monthly_schedule('May')
            mocked_get.assert_called_with('http://company.com/Schafer/May')
            self.assertEqual(schedule, 'Success')


if __name__ == '__main__':
    unittest.main()
When I run test_employee.py, I get this error:
ModuleNotFoundError: No module named 'employee.requests'; 'employee' is not a package
The code runs fine if I delete the test_monthly_schedule function from test_employee.py and the monthly_schedule method from employee.py.
I don't know if it will make a difference, but I'm using Python 3.8. Also, I'm on a Mac.

How do I scrape multiple pages and write the data into Excel

How do I scrape multiple pages into Excel?
For example, I want to scrape "http://econpy.pythonanywhere.com/ex/001.html". How do I scrape the next pages when the number of pages is unknown?
Also, I have written some code, but it writes None into the file instead of the data.
from bs4 import BeautifulSoup
from urllib.request import urlopen

page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1, Header2\n"
f.write(Headers)

html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title": "buyer-info"})
for i in buyer_info:
    Header1 = i.find("div", {"title": "buyer-name"})
    Header2 = i.find("span", {"class": "item-price"})
    salmon = print(Header1.get_text())
    salam = print(Header2.get_text())
    f.write("{}".format(salmon) + "{}".format(salam))
f.close()
What am I doing wrong?
Give this a try and let me know if you run into any issues. I used CSS selectors and requests to get this done.
import csv
import requests
from bs4 import BeautifulSoup

outfile = open('Mynew.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Price"])

for page in range(1, 6):
    html = requests.get("http://econpy.pythonanywhere.com/ex/00{0}.html".format(page))
    soup = BeautifulSoup(html.text, "html.parser")
    for item in soup.select("div[title=buyer-info]"):
        Header1 = item.select_one("div[title=buyer-name]").get_text()
        Header2 = item.select_one("span.item-price").get_text()
        writer.writerow([Header1, Header2])
        print(Header1, Header2)

outfile.close()
I got it solved for the first page... and this is the code:
from bs4 import BeautifulSoup
from urllib.request import urlopen

page_url = "http://econpy.pythonanywhere.com/ex/001.html"
new_file = "Mynew.csv"
f = open(new_file, "w")
Headers = "Header1,Header2\n"
f.write(Headers)

html = urlopen(page_url)
soup = BeautifulSoup(html, "html.parser")
buyer_info = soup.find_all("div", {"title": "buyer-info"})
for i in buyer_info:
    Header1 = i.find("div", {"title": "buyer-name"})
    Header2 = i.find("span", {"class": "item-price"})
    f.write('{},{}\n'.format(Header1.text, Header2.text))
f.close()
Now comes the painful part: how do I scrape multiple pages, i.e. how do I also scrape the next pages?
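Since the total number of pages isn't known in advance, one option (not from the original answer) is to keep requesting page 001, 002, 003, ... and stop as soon as a page returns an error or contains no buyer-info entries. A sketch along those lines, reusing the same selectors as above:

import csv
from itertools import count

import requests
from bs4 import BeautifulSoup

with open('Mynew.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Price"])

    for page in count(1):  # 1, 2, 3, ... until we run out of pages
        url = "http://econpy.pythonanywhere.com/ex/{0:03d}.html".format(page)
        response = requests.get(url)
        if response.status_code != 200:
            break  # page does not exist -> stop

        soup = BeautifulSoup(response.text, "html.parser")
        entries = soup.select("div[title=buyer-info]")
        if not entries:
            break  # page exists but has no data -> stop

        for item in entries:
            name = item.select_one("div[title=buyer-name]").get_text()
            price = item.select_one("span.item-price").get_text()
            writer.writerow([name, price])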
