Scrapy recursive crawling with max DEPTH - Infinite loop - recursion

I'm new to Scrapy, and I'm trying to build a spider that does this kind of work:
Extract all links from a generic web page, recursively and up to a specific depth.
I'm trying to do this with the following code:
class MySpider(CrawlSpider):
    settings.overrides['DEPTH_LIMIT'] = 1
    name = "cnet"
    allowed_domains = ["cnet.com"]
    start_urls = ["http://www.cnet.com/"]

    rules = (Rule(SgmlLinkExtractor(allow_domains=('cnet.com',)), callback="parse_items", follow=True),)

    def parse_items(self, response):
        print ""
        print "PARSE ITEMS"
        print ""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//a')
        items = []
        for titles in titles:
            item = NewsItem()
            item["title"] = titles.select("text()").extract()
            item["link"] = titles.select("@href").extract()
            if (len(item["link"]) > 0) and (self.allowed_domains[0] in item["link"][0]):
                print ""
                print response.meta['depth']
                print item["title"]
                print item["link"]
                print ""
                items.append(item)
        return(items)
But it seems to go into an infinite loop. Any suggestions?
Thanks a lot!
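For reference, a minimal sketch of how a depth-limited crawl is usually configured in more recent Scrapy versions (an assumption about the setup, not a drop-in fix): per-spider settings go into the custom_settings class attribute rather than settings.overrides inside the class body, and DEPTH_LIMIT then caps how far the Rule is followed.

# Minimal sketch, assuming a recent Scrapy release (LinkExtractor replaces the
# old SgmlLinkExtractor). DEPTH_LIMIT=1 stops one level below start_urls.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class CnetSpider(CrawlSpider):
    name = "cnet"
    allowed_domains = ["cnet.com"]
    start_urls = ["http://www.cnet.com/"]

    custom_settings = {
        'DEPTH_LIMIT': 1,  # per-spider setting; could also live in settings.py
    }

    rules = (
        Rule(LinkExtractor(allow_domains=('cnet.com',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        # response.meta['depth'] is filled in by Scrapy's DepthMiddleware
        for anchor in response.xpath('//a'):
            yield {
                'depth': response.meta.get('depth'),
                'title': anchor.xpath('text()').get(),
                'link': anchor.xpath('@href').get(),
            }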

Related

How can I use Google Cloud Functions to run a web scraper?

Thanks in advance for your help.
I'm currently running a web scraper (this is the first time I've ever done something like this). It pulls addresses from the URL and then matches each address against the user's input. This will be going into a chat bot, and I'm wondering how I can make it run on Google Cloud Functions. What's the process for doing this? Is there a tutorial anywhere?
This is my code so far; there is a small items file too.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy


class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]

    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
    def parse(self, response):
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None
        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()
            ratio = fuzz.token_set_ratio(address, first)
            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio
            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]
        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)
        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)
    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()

        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }
I've tried googling and having a look around, but I can't work out how to deploy this as a project to run on Google Cloud Functions. Or can I just copy the code into the console somewhere?
You can try running your spider from a script. However, a better solution is to wrap Scrapy in its own child process.
For example:
from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def my_cloud_function(event, context):
    def script(queue):
        try:
            settings = get_project_settings()
            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })
            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()
            queue.put(None)
        except Exception as e:
            queue.put(e)

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()  # start the process
    main_process.join()   # block until the spider finishes

    result = queue.get()  # check the process did not return an error
    if result is not None:
        raise result

    return 'ok'
You can refer to this tutorial for more info.
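One detail worth noting for this particular spider (my observation, not part of the answer above): Cloud Functions are non-interactive, so the input() calls would never return. A hedged sketch of how the address could instead be passed in through an HTTP-triggered function and handed to the spider as a constructor argument; the function name, the ?address= parameter, and the trimmed-down spider are assumptions for illustration only.

from multiprocessing import Process, Queue

import scrapy
from scrapy.crawler import CrawlerProcess


class AddressesSpider(scrapy.Spider):
    # trimmed-down variant of the spider above; the address to match arrives
    # as a constructor argument instead of being read from input()
    name = 'addresses'
    start_urls = ['https://find-energy-certificate.service.gov.uk/'
                  'find-a-certificate/search-by-postcode?postcode=bh10+4ah']

    def __init__(self, target_address='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.target_address = target_address

    def parse(self, response):
        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            yield {
                'address': row.xpath(
                    "normalize-space(.//a[@class='govuk-link']/text())").get(),
                'wanted': self.target_address,
            }


def run_spider(queue, target):
    try:
        process = CrawlerProcess({'LOG_ENABLED': False})
        # extra keyword arguments to crawl() are forwarded to the spider
        process.crawl(AddressesSpider, target_address=target)
        process.start()
        queue.put(None)
    except Exception as exc:
        queue.put(exc)


def my_http_function(request):
    # hypothetical HTTP-triggered entry point: ?address=... replaces input()
    target = request.args.get('address', '')
    queue = Queue()
    child = Process(target=run_spider, args=(queue, target))
    child.start()
    child.join()
    error = queue.get()
    if error is not None:
        raise error
    return 'ok'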

Asynchronous Loop mixing meta item

I'm trying to scrape the following website:
https://institucional.xpi.com.br/sobre-a-xp/encontre-um-escritorio/
There is a dropdown list to select a state, and from that state it gives me a dropdown list of the available cities.
After submitting, I get the list of all offices (address, e-mail, phone number) for that city.
With this code I'm not getting all the results, and I'm also getting repeated city names; it looks like the meta item from one loop iteration is mixing with the others.
I tried to debug, and here is what happens:
I start the first parse function, and when I enter the loop over the states I get the first state ("AC"). When I reach the yield line, I expected it to go to the parseStates function, but it starts the loop again.
The thing is, it doesn't run the whole loop: it loops through the first five states, then it jumps to the parseStates function.
def parse(self, response):
    statesList = ["AC","AL","AM","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PR","RJ","RN","RO","RS","SC","SE","SP"]
    for state in statesList:
        linkState = 'https://institucional.xpi.com.br/api/Escritorios/FilialListarCidadesV2?vSiglaEstado=' + state
        location = LocationItem()
        location['state'] = state
        yield scrapy.Request(url=linkState, callback=self.parseStates, meta={'item': location})

def parseStates(self, response):
    location = response.meta['item']
    root = ET.fromstring(response.body)
    cityList = [city.text for city in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Nome')]
    for city in cityList:
        location['city'] = city
        state = location['state']
        linkCity = 'https://institucional.xpi.com.br/api/Escritorios/FilialListarPorEstadoCidadeV2?vSiglaEstado=' + state + '&vNomeCidade=' + city.replace(' ', '%20')
        yield scrapy.Request(url=linkCity, callback=self.parseCities, meta={'item': location})

def parseCities(self, response):
    location = response.meta['item']
    state = location['state']
    city = location['city']
    root = ET.fromstring(response.body)
    mailList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}EmailPadronizadoSocioResponsavel')]
    companyList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}RazaoSocial')]
    contactList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}SocioResponsavel')]
    telList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Telefone')]
    for i in range(len(mailList)):
        write(state, city, companyList[i], contactList[i], mailList[i], telList[i])
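One common explanation for this symptom (my reading, not from the original thread): the same LocationItem instance is attached to every request via meta, so later loop iterations overwrite location['city'] before the earlier callbacks have run, and the "five states, then jump" behaviour is just Scrapy's asynchronous scheduler interleaving requests. A minimal sketch of one fix, reusing the names from the code above: build a fresh item per request instead of mutating a shared one.

# Sketch only: same endpoint and item class as above, but a new LocationItem
# is created for every city, so no callback sees another iteration's data.
def parseStates(self, response):
    state = response.meta['item']['state']
    root = ET.fromstring(response.body)
    cityList = [city.text for city in root.iter(
        '{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Nome')]
    for city in cityList:
        location = LocationItem()  # fresh item per city
        location['state'] = state
        location['city'] = city
        linkCity = ('https://institucional.xpi.com.br/api/Escritorios/'
                    'FilialListarPorEstadoCidadeV2?vSiglaEstado=' + state +
                    '&vNomeCidade=' + city.replace(' ', '%20'))
        yield scrapy.Request(url=linkCity, callback=self.parseCities,
                             meta={'item': location})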

Why does WTForms have unbound fields the first time the page is loaded?

I have the following form classes:
class FieldsRequiredForm(FlaskForm):
    class Meta:
        def render_field(self, field, render_kw):
            render_kw.setdefault('required', True)
            return super().render_field(field, render_kw)


class SingleStringFieldForm(FieldsRequiredForm):
    def __init__(self, field_label=None, question_id=None,
                 submit_label='Submit'):
        super().__init__()
        SingleStringFieldForm.answer = StringField(field_label)
        SingleStringFieldForm.question_id = HiddenField(default=question_id)
        SingleStringFieldForm.submit = SubmitField(submit_label)


class SingleRadioFieldForm(FieldsRequiredForm):
    def __init__(self, field_label=None, question_id=None,
                 submit_label='Submit', choices=None):
        super().__init__()
        SingleRadioFieldForm.answer = RadioField(field_label, choices=choices)
        SingleRadioFieldForm.question_id = HiddenField(default=question_id)
        SingleRadioFieldForm.submit = SubmitField(submit_label)
The function that's using these forms looks like this:
@bp.route('/survey/<string:slug>', methods=['GET', 'POST'])
def question(slug):
    survey = Survey.query.filter_by(slug=slug).first_or_404()
    questions = survey.questions
    question_ids = [question.id for question in questions]
    if 'answers' not in session:
        session['answers'] = json.dumps({id: None for id in question_ids})
    answers = json.loads(session['answers'])
    if request.method == 'POST':
        record_submitted_answer()
        answers = json.loads(session['answers'])
    if None in answers.values():
        question = get_next_question()
        if question.category == 'word':
            form = SingleStringFieldForm(field_label=question.question,
                                         question_id=question.id)
        elif question.category == 'likert':
            form = SingleRadioFieldForm(field_label=question.question,
                                        question_id=question.id,
                                        choices=tuple(likert().items()))
    else:
        form = SingleStringFieldForm()
    if form.validate_on_submit():
        if None not in answers.values():
            write_answers_to_database(survey=survey)
            with open('app/static/ty.txt', 'r') as f:
                ty = [x.strip() for x in f.readlines()]
            return render_template('ty.html', ty=ty)
        return redirect(url_for('survey.question', slug=slug))
    return render_template('survey.html', form=form, answers=answers)
The first time I load the page after clearing the session, the form doesn't show up, and when I step through with a debugger at that point, it reports that form.answer has this value:
<UnboundField(RadioField, ('Question 1',), {'choices': (('1', 'Strongly Agree'),
('2', 'Agree'), ('3', 'Neutral'), ('4', 'Disagree'), ('5', 'Strongly Disagree'))})>
If I reload the page, it has this value:
<app.survey.forms.SingleRadioFieldForm object at 0x110788d30>
I don't notice anything else different in the state of the program between the two page loads.
What is causing this the first time and how can I fix it?
While I still don't know why the original code didn't work on the first page load but did on a reload, I went about this another way and solved the problem. Instead of setting the fields within __init__, I subclassed within the view function and edited the class directly.
@bp.route('/survey/<string:slug>', methods=['GET', 'POST'])
def survey(slug):
    class F(FieldsRequiredForm):
        pass

    ...

    if None in answers.values():
        question = get_next_question()
        F.question_id = HiddenField(default=question.id)
        if question.category == 'word':
            F.answer = StringField(question.question)
        elif question.category == 'likert':
            F.answer = RadioField(question.question, choices=tuple(likert().items()))
        F.submit = SubmitField('Submit')
    else:
        F.answer = StringField()
    form = F()

    ...
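As for why the original classes showed UnboundField on the first load only (my understanding of WTForms, not something stated in the thread): WTForms collects field objects from class attributes at the moment the form class is instantiated. In the original __init__, super().__init__() runs before the class attributes are assigned, so the first instance is built without them and form.answer falls back to the raw UnboundField still sitting on the class; by the next request the attributes already exist, so they get bound normally. A small standalone illustration using plain wtforms (no Flask app context needed):

from wtforms import Form, StringField

class F(Form):
    pass

F.answer = StringField('Question 1')  # field is on the class before instantiation
print(F().answer)                     # bound field, renders an <input>

class G(Form):
    def __init__(self):
        super().__init__()            # binding happens here...
        type(self).late = StringField('Too late')  # ...so this misses the first instance

g1 = G()
print(g1.late)   # <UnboundField(StringField, ...)>, like the debugger output above
g2 = G()
print(g2.late)   # bound on the next instantiation, like the page reload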

How to iterate over multiple links, scrape each of them one by one, and save the output to CSV using Python, BeautifulSoup and requests

I have this code, but I don't know how to read the links from a CSV or a list. I want to read the links, scrape details off every single link, and then save the data for each link into its own columns of an output CSV.
Here is the code I built to get the specific data.
from bs4 import BeautifulSoup
import requests

url = "http://www.ebay.com/itm/282231178856"
r = requests.get(url)
x = BeautifulSoup(r.content, "html.parser")
# print(x.prettify().encode('utf-8'))
# time to find some tags!!
# y = x.find_all("tag")

z = x.find_all("h1", {"itemprop": "name"})
# print z
# for loop done to extracting the title.
for item in z:
    try:
        print item.text.replace('Details about ', '')
    except:
        pass

# category extraction done
m = x.find_all("span", {"itemprop": "name"})
# print m
for item in m:
    try:
        print item.text
    except:
        pass

# item condition extraction done
n = x.find_all("div", {"itemprop": "itemCondition"})
# print n
for item in n:
    try:
        print item.text
    except:
        pass

# sold number extraction done
k = x.find_all("span", {"class": "vi-qtyS vi-bboxrev-dsplblk vi-qty-vert-algn vi-qty-pur-lnk"})
# print k
for item in k:
    try:
        print item.text
    except:
        pass

# Watchers extraction done
u = x.find_all("span", {"class": "vi-buybox-watchcount"})
# print u
for item in u:
    try:
        print item.text
    except:
        pass

# returns details extraction done
t = x.find_all("span", {"id": "vi-ret-accrd-txt"})
# print t
for item in t:
    try:
        print item.text
    except:
        pass

# per hour day view done
a = x.find_all("div", {"class": "vi-notify-new-bg-dBtm"})
# print a
for item in a:
    try:
        print item.text
    except:
        pass

# trending at price
b = x.find_all("span", {"class": "mp-prc-red"})
# print b
for item in b:
    try:
        print item.text
    except:
        pass
Your question is kind of vague!
Which links are you talking about? There are a hundred on a single eBay page. Which info would you like to scrape? Similarly, there is a ton of it.
But anyway, here is how I would proceed:
# First, create a list of urls you want to iterate on
urls = []
soup = BeautifulSoup(r.text, "html.parser")  # r is the requests response from the code above
# Assuming your links of interest are values of "href" attributes within <a> tags
a_tags = soup.find_all("a")
for tag in a_tags:
    urls.append(tag["href"])

# Second, start to iterate while storing the info
info_1, info_2 = [], []
for link in urls:
    # Do stuff here, maybe it's time to define your existing loops as functions?
    info_a, info_b = YourFunctionReturningValues(link)
    info_1.append(info_a)
    info_2.append(info_b)
Then if you want a nice csv output:
# Don't forget to import the csv module
import csv

with open(r"path_to_file.csv", "wb") as my_file:
    csv_writer = csv.writer(my_file, delimiter=",")
    csv_writer.writerows(zip(urls, info_1, info_2))
Hope this helps! Of course, don't hesitate to give additional info so I can add more details.
On attributes with BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attributes
About the csv module: https://docs.python.org/2/library/csv.html
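Since the question mentions reading the links from a CSV rather than from a page, here is a hedged sketch of that variant (Python 3; the file names and the single-column layout of links.csv are assumptions): read one URL per row, scrape the title of each page with the same selector used above, and write one output row per URL.

import csv

import requests
from bs4 import BeautifulSoup


def scrape_title(url):
    """Return the item title for one page, or '' if the tag is missing."""
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    tag = soup.find("h1", {"itemprop": "name"})
    return tag.get_text(strip=True).replace('Details about ', '') if tag else ''


with open("links.csv", newline="") as infile:
    urls = [row[0] for row in csv.reader(infile) if row]  # one URL per row

with open("output.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["url", "title"])
    for url in urls:
        writer.writerow([url, scrape_title(url)])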

Groovy GroupBy field with and without white spaces

I have an invoices list as below:
def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']
I am trying to groupBy the above collection on INVOICE_NUMBER, aiming for a map with INVOICE_NUMBER as keys and the lines as values; the code below does it:
def lines = invoices*.split('\\|').findAll{ it.size()>1 }
def heads = lines.first()
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.INVOICE_NUMBER }
If I print invoiceMap, I get the map I intended:
[INV-Error_Test1:[[LINE:1, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:22],
[LINE:2, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test1, INVOICE_TOTAL:24]],
INV-Error_Test2:[[LINE:3, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:26],
[LINE:4, INVOICE_DATE:20150301, INVOICE_NUMBER:INV-Error_Test2, INVOICE_TOTAL:28,]]
]
But if INVOICE_NUMBER has white space in it (e.g. the header is 'INVOICE NUMBER'), my code doesn't work. Can someone help me make my code work with white space in INVOICE_NUMBER?
Use a proper CSV parser, rather than rolling your own.
@Grab('com.xlson.groovycsv:groovycsv:1.0')
import static com.xlson.groovycsv.CsvParser.parseCsv

def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE_NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']

def data = parseCsv(invoices.drop(1).join('\n'), separator: '|')
def invoiceMap = data.collect().groupBy { it.INVOICE_NUMBER }
Or with a space in the column title:
def invoices = [
    'LEDES98BI V2',
    'LINE|INVOICE_DATE|INVOICE NUMBER|INVOICE_TOTAL',
    '1|20150301|INV-Error_Test1|22',
    '2|20150301|INV-Error_Test1|24',
    '3|20150301|INV-Error_Test2|26',
    '4|20150301|INV-Error_Test2|28,']

def data = parseCsv(invoices.drop(1).join('\n'), separator: '|')
def invoiceMap = data.collect().groupBy { it.'INVOICE NUMBER' }
You just need to quote the property name, like this:
def invoiceMap = lines.tail().collect{ [heads, it].transpose().collectEntries() }.groupBy{ it.'INVOICE NUMBER' }
