Asynchronous Loop mixing meta item - web-scraping

I'm trying to scrape the following website:
https://institucional.xpi.com.br/sobre-a-xp/encontre-um-escritorio/
The page has a dropdown list to select a state, and choosing a state populates a second dropdown with the available cities.
After submitting, I get the list of all offices (address, e-mail, phone number) for that city.
With the code below, I'm not getting all the results, and I'm also getting repeated city names; it looks like the meta item from one loop is mixing with the other.
I tried to debug it, and here is what happens:
I start the first parse function and enter the loop over the states. With the first state ("AC"), when I arrive at the yield line, I expected execution to go to the parseStates function, but it starts the loop again instead.
The strange thing is that it doesn't run the whole loop either: it loops through the first five states and only then jumps to the parseStates function.
def parse(self, response):
    statesList = ["AC","AL","AM","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PR","RJ","RN","RO","RS","SC","SE","SP"]
    for state in statesList:
        linkState = 'https://institucional.xpi.com.br/api/Escritorios/FilialListarCidadesV2?vSiglaEstado=' + state
        location = LocationItem()
        location['state'] = state
        yield scrapy.Request(url=linkState, callback=self.parseStates, meta={'item': location})

def parseStates(self, response):
    location = response.meta['item']
    root = ET.fromstring(response.body)
    cityList = [city.text for city in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Nome')]
    for city in cityList:
        location['city'] = city
        state = location['state']
        linkCity = 'https://institucional.xpi.com.br/api/Escritorios/FilialListarPorEstadoCidadeV2?vSiglaEstado=' + state + '&vNomeCidade=' + city.replace(' ', '%20')
        yield scrapy.Request(url=linkCity, callback=self.parseCities, meta={'item': location})

def parseCities(self, response):
    location = response.meta['item']
    state = location['state']
    city = location['city']
    root = ET.fromstring(response.body)
    mailList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}EmailPadronizadoSocioResponsavel')]
    companyList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}RazaoSocial')]
    contactList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}SocioResponsavel')]
    telList = [elem.text for elem in root.iter('{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Telefone')]
    for i in range(len(mailList)):
        write(state, city, companyList[i], contactList[i], mailList[i], telList[i])
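The behaviour seen while debugging is normal for Scrapy: yield hands the request to the scheduler and the loop keeps running; the callback only fires later, when the response comes back. The repeated city names come from the fact that each state's LocationItem is shared by every city request yielded in parseStates, so by the time the parseCities callbacks run, several of them read whichever city was written into that shared item last. A minimal sketch of the usual fix, assuming the same LocationItem and URLs as above, is to build a fresh item for every city request:

def parseStates(self, response):
    state = response.meta['item']['state']
    root = ET.fromstring(response.body)
    cityList = [city.text for city in root.iter(
        '{http://schemas.datacontract.org/2004/07/XP.Portal.Entities}Nome')]
    for city in cityList:
        # New item per request, so later loop iterations cannot overwrite 'city'.
        location = LocationItem()
        location['state'] = state
        location['city'] = city
        linkCity = ('https://institucional.xpi.com.br/api/Escritorios/'
                    'FilialListarPorEstadoCidadeV2?vSiglaEstado=' + state +
                    '&vNomeCidade=' + city.replace(' ', '%20'))
        yield scrapy.Request(url=linkCity, callback=self.parseCities,
                             meta={'item': location})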

Related

Web Scraping: How do I return specific user input forms in python?

I'm having trouble getting the search to return an exact match for the user's input.
Emphasoft developer challenge:
Task 1: Taking a list of tax form names (ex: "Form W-2", "Form 1095-C"), search the website and return some informational results. Specifically, you must return the "Product Number", the "Title", and the maximum and minimum years the form is available for download.
Task 2: Taking a tax form name (ex: "Form W-2") and a range of years (inclusive; 2018-2020 should fetch three years), download all PDFs available within that range.
import json
import os
import sys

import requests
from bs4 import BeautifulSoup

URL = 'https://apps.irs.gov/app/picklist/list/priorFormPublication.html?resultsPerPage=200&sortColumn=sortOrder&indexOfFirstRow=0&{param.strip}&isDescending=false'


def get_forms(list_tax_form: list):
    """
    Function to get responses from irs.gov with all forms content.
    :param list_tax_form: list of form names that we want to get info about
    :return: list of responses, one per form name
    """
    response_list = []  # list for all responses of form names
    with requests.session() as session:
        for param in list_tax_form:
            request_params = {'value': param,
                              'criteria': 'formNumber',
                              'submitSearch': 'Find',
                              }
            res = session.get(URL, params=request_params).content
            response_list.append(res)
    return response_list


def parse_responses(list_tax_form: list):
    """
    Function to get all form names, titles and years from the previous func's return.
    :param list_tax_form: list of form names that we want to get info about
    :return: lists of form name, title and year table cells
    """
    responses = get_forms(list_tax_form)
    # empty lists to fill with the received information for all names, years, and titles
    td_form_name, td_form_title, td_form_rev_year = [], [], []
    for response in responses:
        soup = BeautifulSoup(response, 'lxml')
        td_name = soup.find_all('td', {'class': 'LeftCellSpacer'})
        td_title = soup.find_all('td', {'class': 'MiddleCellSpacer'})
        td_rev_year = soup.find_all('td', {'class': 'EndCellSpacer'})
        td_form_name.extend(td_name)
        td_form_title.extend(td_title)
        td_form_rev_year.extend(td_rev_year)
    return td_form_name, td_form_title, td_form_rev_year


def format_responses(list_tax_form: list):
    """
    Function to format all responses for all the forms we got (Task 1).
    :param list_tax_form: list of form names that we want to get info about
    :return: formatted names, links, years
    """
    td_names, td_titles, td_years = parse_responses(list_tax_form)
    names = [name.text.strip() for name in td_names]
    links = [link.find('a')['href'] for link in td_names]
    titles = [title.text.strip() for title in td_titles]
    years = [int(year.text.strip()) for year in td_years]
    set_names = set(names)
    final_dict = []
    # loop to create a dictionary of result information with the years the tax form is available for download
    for name in set_names:
        max_year = 0
        min_year = max(years)
        dict1 = {'form_number': name}
        for index, p_name in enumerate(names):
            if p_name == name:
                if years[index] > max_year:
                    max_year = years[index]
                elif years[index] < min_year:
                    min_year = years[index]
                dict1['form_title'] = titles[index]
        dict1['max_year'] = max_year
        dict1['min_year'] = min_year
        final_dict.append(dict1)
    print(json.dumps(final_dict, indent=2))
    return names, links, years


def download_files(list_tax_form):
    """
    Download the PDF files of the form_name entered by the user (Task 2).
    :param list_tax_form: list of form names that we want to get info about
    :return: message to the user about the created files
    """
    names, links, years = format_responses(list_tax_form)
    form_name = input('enter form name: ')
    if form_name in names:
        print('form exists. enter years range')
        form_year1 = int(input('start year to analysis: '))
        form_year2 = int(input('end year to analysis: '))
        try:
            os.mkdir(form_name)
        except FileExistsError:
            pass
        # indices to define the range of this form name in the list of all tax form names
        r_index = names.index(form_name)  # index of first form_name mention in the list
        l_index = names.index(form_name)  # index of last form_name mention in the list
        for name in names:
            if name == form_name:
                r_index += 1
        years = years[l_index:r_index]
        if form_year1 < form_year2:
            range_years = range(form_year1, form_year2 + 1)
            for year in range_years:
                if year in years:
                    link = links[years.index(year)]
                    form_file = requests.get(link, allow_redirects=True)
                    open(f'{form_name}/{form_name}_{str(year)}.pdf', 'wb').write(form_file.content)
            print(f'files saved to {form_name}/ directory!')
    else:
        print('input correct form name!')


if __name__ == '__main__':
    tax_list = sys.argv[1:]  # form names
    download_files(tax_list)
(ex: "Form W-2" should not return "Form W-2 P")
When this file is ran, it is displaying other unrelated results.
How can I resolve this issue to display only specified user requests?
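One likely cause (an assumption based on the code above, since I can't see your output): the IRS picklist search matches form numbers as prefixes, so a search for "Form W-2" also returns rows for "Form W-2 P", "Form W-2 G", and so on, and nothing in the code filters those out. A small sketch of a post-filter over the table cells that parse_responses already returns, keeping only rows whose product number matches the requested name exactly:

def keep_exact_matches(form_name, td_names, td_titles, td_years):
    # Keep only table rows whose "Product Number" cell equals form_name exactly
    # (case-insensitive, surrounding whitespace ignored). The three input lists
    # are the ones built in parse_responses above.
    filtered = []
    for name_td, title_td, year_td in zip(td_names, td_titles, td_years):
        if name_td.text.strip().lower() == form_name.strip().lower():
            filtered.append((name_td, title_td, year_td))
    return filtered

Running the names, titles and years through a filter like this before building final_dict (and before computing the download range) should make both tasks operate on exact matches only.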

The New York Times API with R

I'm trying to get article information using The New York Times API. The csv file I get doesn't reflect my filter query; for example, I restricted the source to 'The New York Times', but the file I got also contains other sources.
Why doesn't the filter query work?
Here's the code.
if (!require("jsonlite")) install.packages("jsonlite")
library(jsonlite)
api = "apikey"
nytime = function () {
url = paste('http://api.nytimes.com/svc/search/v2/articlesearch.json?',
'&fq=source:',("The New York Times"),'AND type_of_material:',("News"),
'AND persons:',("Trump, Donald J"),
'&begin_date=','20160522&end_date=','20161107&api-key=',api,sep="")
#get the total number of search results
initialsearch = fromJSON(url,flatten = T)
maxPages = round((initialsearch$response$meta$hits / 10)-1)
#try with the max page limit at 10
maxPages = ifelse(maxPages >= 10, 10, maxPages)
#creat a empty data frame
df = data.frame(id=as.numeric(),source=character(),type_of_material=character(),
web_url=character())
#save search results into data frame
for(i in 0:maxPages){
#get the search results of each page
nytSearch = fromJSON(paste0(url, "&page=", i), flatten = T)
temp = data.frame(id=1:nrow(nytSearch$response$docs),
source = nytSearch$response$docs$source,
type_of_material = nytSearch$response$docs$type_of_material,
web_url=nytSearch$response$docs$web_url)
df=rbind(df,temp)
Sys.sleep(5) #sleep for 5 second
}
return(df)
}
dt = nytime()
write.csv(dt, "trump.csv")
Here's the csv file I got.
It seems you need to put the () inside the quotes, not outside. Like this:
url = paste('http://api.nytimes.com/svc/search/v2/articlesearch.json?',
            '&fq=source:',"(The New York Times)",'AND type_of_material:',"(News)",
            'AND persons:',"(Trump, Donald J)",
            '&begin_date=','20160522&end_date=','20161107&api-key=',api,sep="")
https://developer.nytimes.com/docs/articlesearch-product/1/overview

Why is my vector not accumulating the iteration?

I have data (a stream of Dict results from Splunk) that I want to send to a dataframe.
So, I am iterating through it and pushing each item into a vector, but my vector never keeps the data.
Dv = Vector{Dict}()
for item in reader
    push!(Dv, item)
end
length(Dv)
This is what I get: the vector stays empty.
And I am sure this is the right way to do it; the equivalent approach works in Python.
EDIT
This is the code that I use to access the data that I want to send to a dataframe:
results = pyimport("splunklib.results")
kwargs_oneshot = (earliest_time = "2019-09-07T12:00:00.000-07:00",
                  latest_time = "2019-09-09T12:00:00.000-07:00",
                  count = 0)
searchquery_oneshot = "search index=iis | lookup geo_BST_ONT longitude as sLongitude, latitude as sLatitude | stats count by featureId | geom geo_BST_ONT allFeatures=True | head 2"
oneshotsearch_results = service.jobs.oneshot(searchquery_oneshot; kwargs_oneshot...)

# Get the results and display them using the ResultsReader
reader = results.ResultsReader(oneshotsearch_results)
for item in reader
    println(item)
end
ResultsReader is a streaming reader, which means you "consume" its elements as you iterate over them. You can convert it to an array with collect. Do not print the items before you collect them.
results = pyimport("splunklib.results")
kwargs_oneshot = (earliest_time = "2019-09-07T12:00:00.000-07:00",
                  latest_time = "2019-09-09T12:00:00.000-07:00",
                  count = 0)
searchquery_oneshot = "search index=iis | lookup geo_BST_ONT longitude as sLongitude, latitude as sLatitude | stats count by featureId | geom geo_BST_ONT allFeatures=True | head 2"
oneshotsearch_results = service.jobs.oneshot(searchquery_oneshot; kwargs_oneshot...)

# Get the results
reader = results.ResultsReader(oneshotsearch_results)

# Collect them into an array
Dv = collect(reader)

# Now you can iterate over them without changing the result
for item in Dv
    println(item)
end

IndexError: list index out of range, scores.append( (fields[0], fields[1]))

I'm trying to read a file and put its contents into a list. I have done this many times before and it has worked, but this time it throws the error "list index out of range".
The code is:
with open("File.txt") as f:
scores = []
for line in f:
fields = line.split()
scores.append( (fields[0], fields[1]))
print(scores)
The text file is in the format:
Alpha:[0, 1]
Bravo:[0, 0]
Charlie:[60, 8, 901]
Foxtrot:[0]
I can't see why it is giving me this problem. Is it because I have more than one value for each item, or is it the colon in my text file?
How can I get around this problem?
Thanks
If I understand you correctly, this code will print your desired result:
import re

with open("File.txt") as f:
    # Build a dictionary of scores: {name: scores}.
    scores = {}
    # Regular expressions to parse the team name and the team scores from each line.
    patternScore = r'\[([^\]]+)\]'
    patternName = r'(.*):'
    for line in f:
        # Find the team name and its scores.
        fields = re.search(patternScore, line).groups()[0].split(', ')
        name = re.search(patternName, line).groups()[0]
        # Update the dictionary with the new value.
        scores[name] = fields

# Print the output: first the first score of each entry, then the name.
for key in scores:
    print(scores[key][0] + ':' + key)
You will receive the following output:
60:Charlie
0:Alpha
0:Bravo
0:Foxtrot
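For what it's worth, the original IndexError almost certainly comes from line.split(): it splits on whitespace, so a line like Foxtrot:[0] produces a single field and fields[1] does not exist. If the file always looks like Name:[n1, n2, ...], a minimal regex-free sketch (assuming exactly that format) is to split on the colon and strip the brackets:

scores = {}
with open("File.txt") as f:
    for line in f:
        # "Charlie:[60, 8, 901]" -> name "Charlie", rest "[60, 8, 901]"
        name, _, rest = line.strip().partition(':')
        # "[60, 8, 901]" -> ["60", "8", "901"]
        values = rest.strip('[]').split(', ')
        scores[name] = values

for name, values in scores.items():
    print(values[0] + ':' + name)

This prints the same "60:Charlie"-style output as the regex version above.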

Scrapy recursive crawling with max DEPTH - Infinite loop

I'm new to Scrapy, and I'm trying to build a spider that must do this kind of work:
extract all links from a generic web page, recursively and up to a specific depth.
I'm trying to do this with the following code:
class MySpider(CrawlSpider):
    settings.overrides['DEPTH_LIMIT'] = 1
    name = "cnet"
    allowed_domains = ["cnet.com"]
    start_urls = ["http://www.cnet.com/"]
    rules = (Rule(SgmlLinkExtractor(allow_domains=('cnet.com',)), callback="parse_items", follow=True),)

    def parse_items(self, response):
        print ""
        print "PARSE ITEMS"
        print ""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//a')
        items = []
        for titles in titles:
            item = NewsItem()
            item["title"] = titles.select("text()").extract()
            item["link"] = titles.select("@href").extract()
            if (len(item["link"]) > 0) and (self.allowed_domains[0] in item["link"][0]):
                print ""
                print response.meta['depth']
                print item["title"]
                print item["link"]
                print ""
                items.append(item)
        return items
But it seems to go into an infinite loop. Any suggestions?
Thanks a lot!
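For reference, here is a sketch of how the depth limit is usually configured in more recent Scrapy versions, where settings.overrides no longer exists and SgmlLinkExtractor has been replaced by LinkExtractor. The class, callback and field names simply mirror the question; treat the whole thing as an illustrative sketch rather than a drop-in fix:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = "cnet"
    allowed_domains = ["cnet.com"]
    start_urls = ["http://www.cnet.com/"]

    # DEPTH_LIMIT is enforced by Scrapy's DepthMiddleware; set it per spider
    # via custom_settings (or globally in settings.py) instead of settings.overrides.
    custom_settings = {"DEPTH_LIMIT": 1}

    rules = (Rule(LinkExtractor(allow_domains=("cnet.com",)),
                  callback="parse_items", follow=True),)

    def parse_items(self, response):
        # Yield one record per anchor; response.meta['depth'] is filled in by DepthMiddleware.
        for anchor in response.xpath("//a"):
            link = anchor.xpath("@href").get()
            if link and self.allowed_domains[0] in link:
                yield {
                    "depth": response.meta.get("depth"),
                    "title": anchor.xpath("text()").get(),
                    "link": link,
                }

With DEPTH_LIMIT set this way, requests deeper than the limit are dropped by the middleware, so the crawl terminates instead of following links indefinitely.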
