Scrapy: export parsed data into multiple files - web-scraping

I'd like to parse pages and then export certain item fields to one CSV file and the remaining fields to another file.
Using feed exports, I managed to do it for one file as follows:
settings
# Columns written to the exported feed, in this order.
FEED_EXPORT_FIELDS = (
'url',
'group_url',
'name',
'streetAddress',
'city',
'addressCountry',
)
# Serialize the feed as CSV.
FEED_FORMAT = 'csv'
# %(name)s and %(time)s are placeholders that Scrapy replaces with the spider
# name and a timestamp when the feed file is created.
FEED_URI = 'output/%(name)s_%(time)s.csv'
But, as I said, the settings above export to only one CSV file.
I'd like to be able to export other fields to a second file:
# Desired column set for the second CSV file (employee data).
FEED_EXPORT_FIELDS = (
'employee',
'employee_group',
)
my scraper parse:
def parse(self, response):
    """Build one ``ProductItemLoader`` item from a company page.

    All values go through a ``BasicItemLoader`` whose default input
    processor strips each value and then applies ``replace_escape_chars``.

    :param response: the page response being parsed.
    :return: the item produced by ``load_item()``.
    """
    loader = BasicItemLoader(item=ProductItemLoader(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    # Disabled in the original: default_output_processor = Compose(TakeFirst())
    put, css, xpath = loader.add_value, loader.add_css, loader.add_xpath

    # Request context.
    put('url', response.request.url)
    put('group_url', response.meta.get('section', ''))

    # Name, address and contact details.
    css('name', 'h1[itemprop="name"]::text')
    css('streetAddress', "div[itemprop=\"address\"] [itemprop=\"streetAddress\"]::text")
    css('city', "div[itemprop=\"address\"]>p::text")
    css('addressCountry', "div[itemprop=\"address\"] [itemprop=\"addressCountry\"]::text")
    css('phone', ".phoneCompany>input[id*='freePhone']::attr(value)", TakeFirst())
    css('summary', 'span[itemprop="description"]::text')

    # Facts read from the table cell that follows a labelled cell.
    xpath('year', "//td[contains(text(),'Year established')]/following-sibling::td/text()")
    xpath('registry', "//td[contains(text(),'Registry of commerce')]/following-sibling::td/text()")
    xpath('legal_form', "//td[contains(text(),'Legal form')]/following-sibling::td/text()")
    xpath('vat', "//td[contains(text(),'VAT')]/following-sibling::td/text()")
    xpath('fax', "//td[contains(text(),'Fax')]/following-sibling::td/text()")

    # Web presence, brands and banks.
    css('website', "[id*='webSite_presentation_']::text")
    css('brands', "#company-tradenames .tradeName::text")
    xpath('banks', "//h3[contains(text(),'Banks')]/following-sibling::div//strong/text()")

    # Trade areas, head count and turnover.
    css('export_area', "#exportZones>span:nth-of-type(2)::text")
    css('import_area', "#importZones>span:nth-of-type(2)::text")
    css('export_countries', "#exportCountries>span:nth-of-type(2)::text")
    css('import_countries', "#importCountries>span:nth-of-type(2)::text")
    css('employees', ".employees.bloc .effectif p::text")
    css('turn_over', ".turnover.bloc li:nth-of-type(1)>p:nth-of-type(2)::text")

    return loader.load_item()
and items definition
class ProductItemLoader(scrapy.Item):
    """Scrapy item holding every field the spider's ``parse`` populates."""

    # Request context
    url = scrapy.Field()
    group_url = scrapy.Field()

    # Name and address
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()

    # Contact and description
    phone = scrapy.Field()
    summary = scrapy.Field()

    # Registration facts
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()
    fax = scrapy.Field()

    # Web presence, brands and banks
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()

    # Trade areas
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()

    # Size
    employees = scrapy.Field()
    turn_over = scrapy.Field()

You will have to use your items definition to achieve saving different fields to their own csv files.
items.py:
import scrapy
class ProductItemLoader(scrapy.Item):
    """Item type for the first CSV file: page, address and registration data."""

    # Request context
    url = scrapy.Field()
    group_url = scrapy.Field()

    # Name and address
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()

    # Contact and description
    phone = scrapy.Field()
    summary = scrapy.Field()

    # Registration facts
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()
class EmployeeLoader(scrapy.Item):
    """Item type for the second CSV file: the remaining company fields."""

    # Contact, brands and banks
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()

    # Trade areas
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()

    # Size
    employees = scrapy.Field()
    turn_over = scrapy.Field()
pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher
def item_type(item):
    """Return the class name of *item* as a string.

    The pipeline keys its files and exporters by item class *name*
    (e.g. ``'ProductItemLoader'``), so the name, not the class object,
    must be returned for the membership test and lookup to match.
    """
    return type(item).__name__
class YourSitePipelineHere(object):
    """Write each scraped item to the CSV file that belongs to its item class.

    One file and one ``CsvItemExporter`` are created per entry of
    ``fileNamesCsv`` when the spider opens, and all are finished and closed
    when the spider closes.
    """

    # For simplicity, the CSV names reuse the item class names defined in
    # items.py and used by the spider.
    fileNamesCsv = ['ProductItemLoader', 'EmployeeLoader']

    # Columns exported per item class; each name must match a field declared
    # on that class in items.py (note: ``banks``, not ``bank``).
    _fieldsToExport = {
        'ProductItemLoader': ['url', 'group_url', 'name', 'streetAddress', 'addressCountry', 'city', 'phone', 'summary', 'year', 'registry', 'legal_form', 'vat'],
        'EmployeeLoader': ['fax', 'website', 'brands', 'banks', 'import_area', 'import_countries', 'export_area', 'export_countries', 'employees', 'turn_over'],
    }

    def __init__(self):
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one binary CSV file per item class and start its exporter."""
        # NOTE: "/projec_name" is a literal prefix joined directly to the class
        # name (e.g. "/projec_nameProductItemLoader.csv"); adjust to your project.
        self.files = {
            name: open("/projec_name" + name + '.csv', 'wb')
            for name in self.fileNamesCsv
        }
        for name in self.fileNamesCsv:
            exporter = CsvItemExporter(self.files[name])
            exporter.fields_to_export = self._fieldsToExport[name]
            exporter.start_exporting()
            self.exporters[name] = exporter

    def spider_closed(self, spider):
        """Finish every exporter, then close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for handle in self.files.values():
            handle.close()

    def process_item(self, item, spider):
        """Export *item* through the exporter registered for its type key.

        Items whose key (as returned by ``item_type``) has no exporter are
        passed through untouched.
        """
        typesItem = item_type(item)
        if typesItem in self.exporters:
            self.exporters[typesItem].export_item(item)
        return item
NeilR

#items.py
import scrapy
class JnuItem(scrapy.Item):
    """Scrapy item with the three columns written by the pipeline."""

    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
#pipelines.py
from itemadapter import ItemAdapter
from scrapy.exporters import CsvItemExporter
class SeminarPipeline:
    """Append each item to a CSV file chosen by a keyword in its title.

    An exporter (and its file handle) is created the first time a category
    is seen and reused for every later item, instead of reopening the file
    for each item. Items whose title matches no keyword are passed through
    without being exported.
    """

    # (keyword looked for in the lower-cased title, CSV file appended to),
    # checked in this order.
    _categories = (
        ('webinar', 'webinar7.csv'),
        ('workshop', 'workshop7.csv'),
        ('conference', 'conference7.csv'),
    )

    def __init__(self):
        self.file = None
        self.files = {}     # category keyword -> CsvItemExporter
        self._handles = {}  # category keyword -> open file object

    def open_spider(self, spider):
        """Start the crawl with no exporters and no open files."""
        self.files = {}
        self._handles = {}

    def close_spider(self, spider):
        """Finish every exporter and close the files they wrote to."""
        for exporter in self.files.values():
            exporter.finish_exporting()
        for handle in self._handles.values():
            handle.close()
        self.files = {}
        self._handles = {}

    def file_name(self, item):
        """Return the exporter for *item*'s category, or ``None`` if no keyword matches."""
        adopter = ItemAdapter(item)
        string = str(adopter.get('title', '')).lower()
        for keyword, path in self._categories:
            if keyword in string:
                break
        else:
            return None
        if keyword not in self.files:
            # Files are opened in append mode and no header line is written.
            handle = open(path, 'ab')
            exporter = CsvItemExporter(handle, include_headers_line=False)
            exporter.fields_to_export = ['date', 'title', 'link']
            exporter.start_exporting()
            self._handles[keyword] = handle
            self.files[keyword] = exporter
        return self.files[keyword]

    def process_item(self, item, spider):
        """Export *item* to its category file when it has one; always return it."""
        exporter = self.file_name(item)
        if exporter is not None:
            exporter.export_item(item)
        return item
#settings.py
# Enable the pipeline; the integer is the pipeline's order value.
ITEM_PIPELINES = {'seminar.pipelines.SeminarPipeline': 300,}

Related

How to pass username or email in different class with kivymd and firebase

How can I pass the username or email to a different class?
I want to read the current user's username or email from my Firebase database and display it in the `Detect` screen class.
Here is the code:
# for Home page
class Login_screen(MDScreen):
    """Log-in screen: checks the entered credentials against the Firebase data."""

    # sign up button
    def go_to_signup(self, *args):
        self.manager.current = 'signup_screen'

    # clear log in text
    def clear_txt_login(self):
        self.ids.login_username.text = ''
        self.ids.login_pass.text = ''

    # button to log in
    def log_in(self):
        """Switch to the 'detection' screen on a match, otherwise show a dialog."""
        id_username_login = self.ids.login_username.text
        id_pass_login = self.ids.login_pass.text
        self.login_check = False
        # The entered password is compared with '-' replaced by '.'.
        supported_loginPassword = id_pass_login.replace('-', '.')
        request = requests.get(url + '?auth=' + auth_firebase)  # firebase url and auth
        data = request.json()
        # The response body may be JSON null (e.g. nothing stored yet) and a
        # record may lack a 'Password' entry; treat both as "no match".
        record = data.get(id_username_login) if isinstance(data, dict) else None
        if isinstance(record, dict) and supported_loginPassword == record.get('Password'):
            self.login_check = True
            self.manager.current = 'detection'
            print(id_username_login)
        else:
            cancel_btn_username_dialogue = MDFlatButton(text='ok', on_release=self.close_dialog)
            self.dialog = MDDialog(
                title='Not Found',
                text='Invalid not Found',
                size_hint=(0.7, 0.2),
                buttons=[cancel_btn_username_dialogue],
            )
            self.dialog.open()

    def show_password(self, value):
        """Toggle password masking and update the toggle's caption."""
        if self.ids.login_pass.password:
            self.ids.login_pass.password = False
            self.ids.show_txt_password.text = 'Hide Password'
        else:
            self.ids.login_pass.password = True
            self.ids.show_txt_password.text = 'Show Password'

    def close_dialog(self, obj):
        self.dialog.dismiss()
class Signup_screen(MDScreen):
    """Sign-up screen: validates the form and stores the new user in Firebase."""

    # log in button
    def go_to_login(self, *args):
        self.manager.current = 'login_screen'

    # clear sign up text
    def clear_txt_signup(self):
        self.ids.signup_username.text = ''
        self.ids.signup_email.text = ''
        self.ids.signup_pass.text = ''

    def _show_dialog(self, title, text, button_text):
        """Open a dialog with one button that dismisses it."""
        cancel_btn_username_dialogue = MDFlatButton(text=button_text, on_release=self.close_dialog)
        self.dialog = MDDialog(
            title=title,
            text=text,
            size_hint=(0.7, 0.2),
            buttons=[cancel_btn_username_dialogue],
        )
        self.dialog.open()

    # button to sign up
    def register(self):
        """Validate the three fields, then PATCH the new record and continue."""
        id_username_signup = self.ids.signup_username.text
        id_email_signup = self.ids.signup_email.text
        id_pass_signup = self.ids.signup_pass.text
        fields = (id_username_signup, id_email_signup, id_pass_signup)
        if any(not field.strip() for field in fields):
            self._show_dialog('Invalid Input', 'Please Enter a valid Input', 'Retry')
        elif len(id_username_signup.split()) > 1:
            self._show_dialog('Invalid Student ID', 'Please Enter Student number without space', 'ok')
        elif '@' not in id_email_signup:
            # An e-mail address must contain '@'.
            self._show_dialog('Invalid Email', 'Please Enter a valid email forman', 'ok')
        else:
            # Build the payload as a dict so quotes or backslashes in the
            # input cannot break the JSON that is sent.
            to_database = {
                id_username_signup: {
                    "Password": id_pass_signup,
                    "Email": id_email_signup,
                }
            }
            requests.patch(url=url, json=to_database)  # firebase url
            self.manager.current = 'detection'

    def show_password(self, value):
        """Toggle password masking and update the toggle's caption."""
        if self.ids.signup_pass.password:
            self.ids.signup_pass.password = False
            self.ids.show_txt_password.text = 'Hide Password'
        else:
            self.ids.signup_pass.password = True
            self.ids.show_txt_password.text = 'Show Password'

    def close_dialog(self, obj):
        self.dialog.dismiss()
class Detect(MDScreen):
    """Screen registered as 'detection'; adds nothing to ``MDScreen``."""
class Application(MDApp):
    """App entry point: sets icon, title and theme, then builds the screens."""

    def build(self):
        """Return a screen manager holding the four screens, in order."""
        self.icon = 'assets/icons/icon.png'
        self.title = "Student Log In"

        # theme/s of the app
        theme = self.theme_cls
        theme.primary_palette = "Blue"
        theme.theme_style = "Light"

        manager = MDScreenManager(transition=MDSlideTransition())
        screens = (
            (Splash_screen, 'splash_screen'),
            (Login_screen, 'login_screen'),
            (Signup_screen, 'signup_screen'),
            (Detect, 'detection'),
        )
        for screen_cls, screen_name in screens:
            manager.add_widget(screen_cls(name=screen_name))
        return manager
I can already get the username (or email) by printing it inside `log_in`:
if id_username_login in emails and supported_loginPassword == data[id_username_login]['Password']:
self.login_check=True
self.manager.current = 'detection'
print(id_username_login)
Now I want to take that value (the text entered in the log-in or sign-up screen, or the current user's username from Firebase) and show it in the `Detect` screen in the format 'WELCOME USERNAME'.

Unable to reconnect to a socket

I am trying to connect to a socket server from a client that runs a Tkinter GUI. The script is a banking app that lets users log in; the user data is stored in an SQLite database. When I run the server and client scripts, the Tkinter window launches and the user is able to log in. However, when the user logs out and tries to log in as another user, the program gets stuck.
I think LoginCommand() is where it gets stuck, since it only fails on the second call. I believe it has to do with how I am disconnecting from the socket. However, I need the server socket to keep running for the app to work.
Socket :
import socket
import sqlite3
# One SQLite connection shared by every credential lookup.
Connection = sqlite3.connect('BankAccounts.sqlite')


def check_login(cdata):
    """Return the reply for one ``*li#<username>##<password>`` request.

    The reply is ``"*Suc*<balance>"`` when a row of ``bankusers`` has the
    given username in column 0 and password in column 1, and ``"*Fail*"``
    otherwise (including malformed requests). A reply is always produced so
    the client's blocking ``recv`` cannot wait forever.
    """
    cred = cdata.split("*li#")[1].split("##")
    if len(cred) < 2:
        return "*Fail*"
    username, password = cred[0], cred[1]
    for row in Connection.execute("SELECT * FROM bankusers"):
        if row[0] == username and row[1] == password:
            return "*Suc*" + str(row[2])
    return "*Fail*"


st = socket.socket()  # Listening socket.
print("socket ready")
st.bind(("Localhost", 8080))  # IP address + port number
st.listen(100)  # Backlog of up to 100 pending connections.
print("waiting for connections")

while True:
    Client, address = st.accept()
    # print("connected with", address)
    print("Banking")
    # Keep serving the SAME connection until the client closes it: the client
    # opens one socket at start-up and reuses it for every log-in, so going
    # back to accept() after a single message would leave the second log-in
    # unanswered. Clients are served one at a time.
    with Client:
        while True:
            try:
                cdata = Client.recv(1024).decode()
            except (ConnectionError, UnicodeDecodeError):
                break
            print("in loop")
            if not cdata:
                # Empty read: this client disconnected; wait for the next one
                # instead of shutting the server down.
                break
            if "*li#" in cdata:
                print(cdata)
                login = check_login(cdata)
                print(login)
                Client.sendall(login.encode('utf-8'))
Client :
from tkinter import *
from tkinter import *
from Bank import *
import socket
# Main Tk window; the widgets created in this script use it as parent.
root = Tk()
# One client socket is opened at start-up and stored at module level.
Client = socket.socket()
Client.connect(("Localhost",8080))
import json
# Module-level login status, initialised to the failed state.
login = "Failed"
import sqlite3
# The client also opens the SQLite database file directly.
Connection = sqlite3.connect('BankAccounts.sqlite')
def localuser(username, password, balance):
    """Bundle the three account values into a dict keyed by field name."""
    return {"username": username, "password": password, "balance": balance}
def sqlupdate():
    """Write the balance stored in ``bankusers.json`` back to the database.

    Reads ``username`` and ``balance`` from the JSON file, updates that
    user's row in ``bankusers`` and commits.
    """
    with open("bankusers.json", "r") as file:
        reader = json.load(file)
    balance2 = reader["balance"]
    username2 = reader["username"]
    print(balance2)
    print(username2)
    # The balance may have been stored as text; bind it as a number so the
    # stored value stays numeric, as it was with the interpolated statement.
    if isinstance(balance2, str):
        try:
            balance2 = int(balance2)
        except ValueError:
            balance2 = float(balance2)
    # Placeholders keep the values out of the SQL text (no injection, no
    # breakage on quotes in the username).
    Connection.execute(
        "UPDATE bankusers SET balance = ? WHERE username = ?",
        (balance2, username2),
    )
    Connection.commit()
def updatenewuser(username, amount):
    """Add *amount* to the balance of *username* and commit the change.

    Prints "The user does not exist" when no row matches or when the update
    fails; no exception is propagated to the caller.
    """
    try:
        rows = Connection.execute(
            "SELECT * FROM bankusers WHERE username = ?", (username,)
        ).fetchall()
        if not rows:
            print("The user does not exist")
            return
        for row in rows:
            newbalance = int(row[2]) + int(amount)
            Connection.execute(
                "UPDATE bankusers SET balance = ? WHERE username = ?",
                (newbalance, username),
            )
        # Without a commit the credit stays in an open transaction and is
        # lost if the program exits.
        Connection.commit()
    except (sqlite3.Error, ValueError, TypeError):
        print("The user does not exist")
def logout():
    """Clear the canvas and rebuild the log-in form with fresh entry boxes."""
    global usernamebox, passwordbox

    canvas.delete("all")
    user_caption = Label(root, text="Username")
    pass_caption = Label(root, text="Password")
    usernamebox = Entry(root, width=20)
    passwordbox = Entry(root, width=20)
    login_button = Button(root, text="Login", width=10, command=LoginCommand)

    # (x, y, widget) placements, in drawing order.
    layout = (
        (230, 100, user_caption),
        (230, 150, pass_caption),
        (400, 100, usernamebox),
        (400, 150, passwordbox),
        (400, 200, login_button),
    )
    for x, y, widget in layout:
        canvas.create_window(x, y, window=widget)
    canvas.pack()
def _show_pay_result(message, label_y, button_y):
    """Show *message* with a "Done" button that returns to the main screen."""
    label1 = Label(root, text=message)
    backbut = Button(root, text="Done", command=mainscreen)
    canvas.create_window(400, label_y, window=label1)
    canvas.create_window(400, button_y, window=backbut)


def pay():
    """Transfer the entered amount from the logged-in user to the named user.

    The amount must be a positive whole number and the recipient must exist;
    both are checked BEFORE the sender is debited, so a failed transfer
    leaves every balance unchanged. Any failure shows the failure screen.
    """
    if name.get() == username1:
        trylabel = Label(root, text="You are unable to send money to yourself",)
        canvas.create_window(400, 250, window=trylabel)
        return

    canvas.delete("all")
    try:
        recipient = name.get()
        amount = int(amountbox.get())
        if amount <= 0:
            # A negative amount would move money from the recipient to the sender.
            raise ValueError("amount must be positive")
        known = Connection.execute(
            "SELECT 1 FROM bankusers WHERE username = ?", (recipient,)
        ).fetchone()
        if known is None:
            raise ValueError("unknown recipient")
        row = Connection.execute(
            "SELECT * FROM bankusers WHERE username = ?", (username1,)
        ).fetchone()
        if row is None:
            raise ValueError("unknown sender")

        if amount <= row[2]:
            print("Sent")
            newamount = row[2] - amount
            with open("bankusers.json", "w") as file:
                user = localuser(username1, password1, newamount)
                json.dump(user, file, indent=3)
            sqlupdate()
            updatenewuser(recipient, amount)
            _show_pay_result("Transaction Succsfull", 100, 200)
        else:
            _show_pay_result(
                "Transaction Failed. Please ensure you have suffcient funds for this transaction",
                200,
                300,
            )
    except Exception:
        # UI boundary: whatever went wrong, show the failure screen instead of
        # leaving the user on an empty canvas.
        _show_pay_result(
            "Transaction Failed. Please check recipient name and amount",
            200,
            300,
        )
def transact():
    """Replace the canvas content with the transfer form (recipient + amount)."""
    global name, amountbox

    print("")
    canvas.delete("all")
    recipient_caption = Label(root, text="Person Receiving:")
    amount_caption = Label(root, text="Amount:")
    name = Entry(root, width=20)
    amountbox = Entry(root, width=20)
    pay_button = Button(root, text="Pay", command=pay)
    back_button = Button(root, text="Back", command=mainscreen)

    # (x, y, widget) placements, in drawing order.
    layout = (
        (350, 300, back_button),
        (450, 100, name),
        (250, 100, recipient_caption),
        (250, 200, amount_caption),
        (450, 300, pay_button),
        (450, 200, amountbox),
    )
    for x, y, widget in layout:
        canvas.create_window(x, y, window=widget)
    return
def LoginCommand():
    """Send the entered credentials to the server and act on the reply.

    A reply containing ``*Suc*`` stores the credentials and the balance that
    follows the marker in ``bankusers.json`` and opens the main screen; any
    other reply shows a "Login Failed" label.
    """
    global username1, password1, balance1, label2

    request = "*li#" + usernamebox.get() + "##" + passwordbox.get()
    Client.send(request.encode("utf-8"))
    message = Client.recv(1024).decode()
    print(message)

    if "*Suc*" not in message:
        label2 = Label(root, text="Login Failed. Please Try Again")
        canvas.create_window(400, 250, window=label2)
        return

    # Read the entries before destroying them.
    username1 = usernamebox.get()
    password1 = passwordbox.get()
    usernamebox.destroy()
    passwordbox.destroy()
    canvas.delete("all")

    balance1 = message.split("*Suc*")
    account = localuser(username1, password1, balance1[1])
    with open("bankusers.json", "w") as file:
        json.dump(account, file, indent=3)
    mainscreen()
def mainscreen():
    """Show the balance stored in ``bankusers.json`` with log-out/transact buttons."""
    with open("bankusers.json", "r") as file:
        balance2 = json.load(file)["balance"]

    label2.destroy()
    canvas.delete("all")

    balance_label = Label(root, text=f"Available Balance: R{balance2}")
    logout_button = Button(root, text="Log Out", command=logout)
    transact_button = Button(root, text="Transact", command=transact)

    # (x, y, widget) placements, in drawing order.
    layout = (
        (400, 100, balance_label),
        (350, 200, logout_button),
        (450, 200, transact_button),
    )
    for x, y, widget in layout:
        canvas.create_window(x, y, window=widget)
    canvas.pack()
# Build the initial log-in form and hand control to Tk's event loop.
root.title("Raindrop Bank")
canvas = Canvas(root, width=800, height=400)

label1 = Label(root, text="Username")
label2 = Label(root, text="Password")
usernamebox = Entry(root, width=20)
passwordbox = Entry(root, width=20)
buttonLogin = Button(root, text="Login", width=10, command=LoginCommand)

# (x, y, widget) placements, in drawing order.
for _x, _y, _widget in (
    (230, 100, label1),
    (230, 150, label2),
    (400, 100, usernamebox),
    (400, 150, passwordbox),
    (400, 200, buttonLogin),
):
    canvas.create_window(_x, _y, window=_widget)

canvas.pack()
root.mainloop()

How to mock connection for airflow's Livy Operator using unittest.mock

# Patch the environment for the whole test case so that the only variable
# present is the Airflow connection definition below.
@mock.patch.dict(
    "os.environ",
    AIRFLOW_CONN_LIVY_HOOK="http://www.google.com",
    clear=True,
)
class TestLivyOperator(unittest.TestCase):
    """Builds a ``LivyOperator`` against a connection supplied via the environment."""

    def setUp(self):
        super().setUp()
        self.dag = DAG(
            dag_id="test_livy",
            default_args={
                "owner": "xyz",
                "start_date": datetime(2022, 8, 16),
            },
        )

    @requests_mock.mock()
    def test_payload(self, mock_request):
        task = LivyOperator(
            task_id="task_1",
            class_name="com.precious.myClass",
            executor_memory="512m",
            executor_cores=3,
            arg=spark_args,
            # NOTE(review): kept as posted because this value is what triggers
            # the reported "conn_id ... isn't defined" error. Airflow looks
            # up AIRFLOW_CONN_<CONN_ID in upper case>, so the id matching the
            # patched variable is "livy_hook".
            livy_conn_id=AIRFLOW_CONN_LIVY_HOOK,
            dag=self.dag,
        )
I get a "connection not defined" error:
airflow.exceptions.AirflowNotFoundException: The conn_id 'AIRFLOW_CONN_LIVY_HOOK' isn't defined
As per the Airflow doc
https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html#storing-connections-in-environment-variables
The naming convention is AIRFLOW_CONN_{CONN_ID}, all uppercase (note the single underscores surrounding CONN). So if your connection id is my_prod_db, then the variable name should be AIRFLOW_CONN_MY_PROD_DB. Hence, with the variable AIRFLOW_CONN_LIVY_HOOK, the connection id to pass is "livy_hook", and the above code should be:
@requests_mock.mock()
def test_payload(self, mock_request):
    """Build the operator with the connection id that matches the env variable."""
    task = LivyOperator(
        task_id="task_1",
        class_name="com.precious.myClass",
        executor_memory="512m",
        executor_cores=3,
        arg=spark_args,
        # AIRFLOW_CONN_LIVY_HOOK in the environment defines conn id "livy_hook".
        livy_conn_id="livy_hook",
        dag=self.dag,
    )

web scraper Python AttributeError: 'NoneType' object has no attribute 'get'

Hi, does anyone know how to troubleshoot this?
url = "https://www.zillow.com/walnut-ca/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Walnut%2C%20CA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-117.93482729053105%2C%22east%22%3A-117.75286623096073%2C%22south%22%3A33.93783156520187%2C%22north%22%3A34.06392018234896%7D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A12%2C%22filterState%22%3A%7B%22price%22%3A%7B%22min%22%3A400000%2C%22max%22%3A700000%7D%2C%22mp%22%3A%7B%22min%22%3A1448%2C%22max%22%3A2535%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D"
d = {'key':'value'}
print(d)
d['new key'] = 'new value'
print(d)
# Collected listings, keyed by a running number starting at 1.
query_houses = {}
house_no = 0

# Follow the "Next-page" link until a page has none.
while True:
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    houses = soup.find_all('article', {'class': 'list-card list-card-short list-card_not-saved'})

    for house in houses:
        location = house.find('address', {'class': 'list-card-addr'})
        value = house.find('div', {'class': 'list-card-price'})
        detail = house.find('ul', {'class': 'list-card-details'})
        seller = house.find('div', {'class': 'list-card-truncate'})

        # find() returns None when nothing matches, so calling .get() on the
        # result directly raises "'NoneType' object has no attribute 'get'".
        link_tag = house.find('a', {'class': 'list-card-link'})
        link = link_tag.get('href') if link_tag is not None else None

        square = year_build = estimated_sales_range = None
        if link:
            # requests needs an absolute URL; site-relative links get the host.
            detail_url = 'https://zillow.com' + link if link.startswith('/') else link
            house_response = requests.get(detail_url)
            house_data = house_response.text
            house_soup = BeautifulSoup(house_data, 'html.parser')
            square = house_soup.find('span', {'class': 'ds-bed-bath-living-area'})
            year_build = house_soup.find('span', {'class': 'ds-body ds-home-fact-value'})
            estimated_sales_range = house_soup.find('div', {'class': 'Spacer-sc-17suqs2-0 pfWXf'})

        house_no += 1
        query_houses[house_no] = [location, value, detail, seller, link, square, year_build, estimated_sales_range]

    # The last page (or a page with unexpected markup) has no "Next-page"
    # anchor; stop instead of dereferencing None.
    url_tag = soup.find('a', {'title': 'Next-page'})
    next_href = url_tag.get('href') if url_tag is not None else None
    if not next_href:
        break
    url = 'https://zillow.com' + next_href
    print(url)

Change initialize method in subclass of an R6 class

Let's say I have a R6 class Person:
library(R6)
# Base class: stores a name and a hair value and greets on creation.
Person <- R6Class(
  "Person",
  public = list(
    name = NA,
    hair = NA,

    # Store both fields, then greet immediately.
    initialize = function(name, hair) {
      self$name <- name
      self$hair <- hair
      self$greet()
    },

    # Print a one-line greeting containing the stored name.
    greet = function() {
      cat("Hello, my name is ", self$name, ".\n", sep = "")
    }
  )
)
If I want to create a subclass whose initialize method should be the same except for adding one more variable to self how would I do this?
I tried the following:
# Attempted subclass: adds a `surname` field on top of Person.
PersonWithSurname <- R6Class("PersonWithSurname",
inherit = Person,
public = list(surname = NA,
initialize = function(name, surname, hair) {
# NOTE: Person$new() creates a separate Person object whose result is not
# assigned; `name` and `hair` are never set on `self` here.
Person$new(name, hair)
self$surname <- surname
})
)
However when I create a new instance of class PersonWithSurname the fields name and hair are NA, i.e. the default value of class Person.
PersonWithSurname$new("John", "Doe", "brown")
Hello, my name is John.
<PersonWithSurname>
Inherits from: <Person>
Public:
clone: function (deep = FALSE)
greet: function ()
hair: NA
initialize: function (name, surname, hair)
name: NA
surname: Doe
In Python I would do the following:
class Person(object):
    """A person with a name and a hair value who greets on creation."""

    def __init__(self, name, hair):
        self.name = name
        self.hair = hair
        self.greet()

    def greet(self):
        """Print a one-line greeting containing the stored name."""
        # print() as a function: valid on Python 3, where the statement form
        # is a syntax error.
        print("Hello, my name is " + self.name)
class PersonWithSurname(Person):
    """``Person`` that additionally stores a surname."""

    def __init__(self, name, surname, hair):
        # Let the parent initializer set name/hair (and greet) on this same
        # object, then add the extra attribute.
        super(PersonWithSurname, self).__init__(name, hair)
        self.surname = surname
R6 works very much like Python in this regard; that is, you just call initialize on the super object:
# Subclass of Person that additionally stores a surname.
PersonWithSurname <- R6Class(
  "PersonWithSurname",
  inherit = Person,
  public = list(
    surname = NA,

    # Run the parent's initializer on this object, then record the surname.
    initialize = function(name, surname, hair) {
      super$initialize(name, hair)
      self$surname <- surname
    }
  )
)

Resources