Can't scrape an image url from Zara - web-scraping

I am trying to scrape an image URL from Zara, but the only thing I get back is the URL of the transparent background placeholder.
This is the link I'm trying to scrape: https://static.zara.net/photos///2022/V/0/1/p/9598/176/406/2/w/850/9598176406_1_1_1.jpg?ts=1640187784252
This is the link I keep getting:
https://static.zara.net/stdstatic/1.249.0-b.13/images/transparent-background.png
Any ideas? This is my code. Thank you in advance!
*Note: I used extract() on the image, not extract_first(), to see if there were several links, but they are all the same.
import scrapy
from scrapy.linkextractors import LinkExtractor
from Zara.items import Producto

class ZaraSpider(scrapy.Spider):
    name = 'zara'
    allowed_domains = ['zara.com']
    start_urls = [
        'https://www.zara.com/es/es/jersey-punto-cuello-subido-p09598176.html'
    ]

    def parse(self, response):
        producto = Producto()
        # Extract the links
        links = LinkExtractor(
            allow_domains=['zara.com'],
            restrict_xpaths=["//a"],
            allow="/es/es/"
        ).extract_links(response)
        outlinks = []  # List of all the links
        for link in links:
            url = link.url
            outlinks.append(url)  # Add the link to the list
            yield scrapy.Request(url, callback=self.parse)  # Generate the request
        product = response.xpath('//meta[@content="product"]').extract()
        if product:
            # Extract the URL, product name, description and price
            producto['url'] = response.request.url
            producto['nombre'] = response.xpath('//h1[@class="product-detail-info__name"]/text()').extract_first()
            producto['precio'] = response.xpath('//span[@class="price__amount-current"]/text()').extract_first()
            producto['descripcion'] = response.xpath('//div[@class="expandable-text__inner-content"]//text()').extract_first()
            producto['imagen'] = response.xpath('//img[@class="media-image__image media__wrapper--media"]/@src').extract()
            #producto['links'] = outlinks
            yield producto

The problem is that the image URL is generated with JavaScript. Request the page with scrapy shell and view the response; you'll see that you can find the requested image URL another way.
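For example, a quick way to confirm this in scrapy shell (a sketch; the page markup may have changed since this was written):

    scrapy shell 'https://www.zara.com/es/es/jersey-punto-cuello-subido-p09598176.html'
    >>> view(response)  # opens the response as scrapy sees it, without JavaScript
    >>> # the product image is still present in the page metadata:
    >>> response.xpath('//meta[@property="og:image"]/@content').get()

Here is the full spider with that change applied: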
import scrapy
from scrapy.linkextractors import LinkExtractor
# from Zara.items import Producto

class Producto(scrapy.Item):
    url = scrapy.Field()
    nombre = scrapy.Field()
    precio = scrapy.Field()
    descripcion = scrapy.Field()
    imagen = scrapy.Field()
    links = scrapy.Field()

class ZaraSpider(scrapy.Spider):
    name = 'zara'
    allowed_domains = ['zara.com']
    start_urls = [
        'https://www.zara.com/es/es/jersey-punto-cuello-subido-p09598176.html'
    ]

    def parse(self, response):
        producto = Producto()
        # Extract the links
        links = LinkExtractor(
            allow_domains=['zara.com'],
            restrict_xpaths=["//a"],
            allow="/es/es/"
        ).extract_links(response)
        outlinks = []  # List of all the links
        for link in links:
            url = link.url
            outlinks.append(url)  # Add the link to the list
            yield scrapy.Request(url, callback=self.parse)  # Generate the request
        product = response.xpath('//meta[@content="product"]').get()
        if product:
            # Extract the URL, product name, description and price
            producto['url'] = response.request.url
            producto['nombre'] = response.xpath('//h1[@class="product-detail-info__name"]/text()').get()
            producto['precio'] = response.xpath('//span[@class="price__amount-current"]/text()').get()
            producto['descripcion'] = response.xpath('//div[@class="expandable-text__inner-content"]//text()').get()
            # The image URL is available in the og:image meta tag, which needs no JavaScript
            producto['imagen'] = response.xpath('//meta[@property="og:image"]/@content').get()
            #producto['links'] = outlinks
            yield producto
BTW check out CrawlSpider.
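A minimal sketch of the same crawl written as a CrawlSpider (same link pattern as above; untested against the live site):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ZaraCrawlSpider(CrawlSpider):
    name = 'zara_crawl'
    allowed_domains = ['zara.com']
    start_urls = ['https://www.zara.com/es/es/jersey-punto-cuello-subido-p09598176.html']

    # CrawlSpider follows every matching link for you and
    # calls the callback on each page it visits
    rules = (
        Rule(LinkExtractor(allow='/es/es/', allow_domains=['zara.com']),
             callback='parse_product', follow=True),
    )

    def parse_product(self, response):
        # only product pages carry this meta tag
        if response.xpath('//meta[@content="product"]').get():
            yield {
                'url': response.url,
                'imagen': response.xpath('//meta[@property="og:image"]/@content').get(),
            }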

Related

unknown url type: 'URL' - I am getting this error. Can anyone help me with this?

import csv
import re
from urllib.request import urlopen

with open('Input_textscrapping_CSV.csv', "rt") as f:
    reader = csv.reader(f)
    for line in reader:
        #id = line[0]
        url = line[1]  # assuming your url is in the second column
        print(url)
        #print(id, url)
        # the following is added by me
        page = urlopen(url)
        html = page.read().decode("utf-8")
        pattern = "<title.*?>.*?</title.*?>"
        match_results = re.search(pattern, html, re.IGNORECASE)
        title = match_results.group()
        title = re.sub("<.*?>", "", title)  # Remove HTML tags
        print(title)
I tried the above code but it gives the error unknown url type: 'URL'.
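That error message means urlopen() received the literal string 'URL' rather than an address, which strongly suggests the CSV's header row is being read as data. A minimal sketch of the fix, assuming the first row is a header such as ID,URL:

import csv
from urllib.request import urlopen

with open('Input_textscrapping_CSV.csv', "rt") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row, e.g. ['ID', 'URL']
    for line in reader:
        url = line[1]
        page = urlopen(url)  # now receives a real address, not the text 'URL'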

How to solve the "AttributeError: 'NoneType' object has no attribute 'find'"

I'm trying to run the code below, which I got from this site. However, it keeps giving "AttributeError: 'NoneType' object has no attribute 'find'" on the title = f.find('a')['title'] line. I'd be so glad if you could help me solve this issue.
import requests
from bs4 import BeautifulSoup

USER_AGENT = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

def obtener_resultados(termino_busqueda, numero_resultados, codigo_lenguaje):
    url_google = 'https://www.google.com/search?q={}&num={}&hl={}'.format(termino_busqueda, numero_resultados, codigo_lenguaje)
    respuesta = requests.get(url_google, headers=USER_AGENT)
    respuesta.raise_for_status()
    return termino_busqueda, respuesta.text

def procesar_resultados(html, palabra):
    soup = BeautifulSoup(html, 'html.parser')
    resultados_encontrados = []
    bloque = soup.find_all("div", class_="g")
    for resultado in bloque:
        titulo = resultado.find('h3').string
        resultados_encontrados.append(titulo)
    return resultados_encontrados

def scrape(termino_busqueda, numero_resultados, codigo_lenguaje):
    palabra, html = obtener_resultados(termino_busqueda, numero_resultados, codigo_lenguaje)
    resultados = procesar_resultados(html, palabra)
    return resultados

if __name__ == '__main__':
    palabra = 'Quantika14'
    h5 = (palabra, 1, "es")
    h6 = (h5[0])
    username = h6
    url = 'https://www.twitter.com/' + username
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    f = soup.find('li', class_="ProfileNav-item--followers")
    title = f.find('a')['title']
    print(title)
    g = soup.find_all('title', limit=1)
    h = soup.select('.bio', limit=1)
    title2 = g
    print(title2)
    title3 = h
    print(title3)
To get rid of the NoneType error you can apply an if/else guard.
Example, assuming your element selection is correct:
title = f.find('a')['title'] if f.find('a') else None
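Note that in this script it is f itself that is None: soup.find('li', class_="ProfileNav-item--followers") finds nothing, most likely because Twitter no longer serves that markup in the raw HTML, so calling f.find('a') raises the error. A sketch that guards both lookups:

f = soup.find('li', class_="ProfileNav-item--followers")
a = f.find('a') if f else None  # f may be None if the element is absent
title = a['title'] if a and a.has_attr('title') else None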

POST don't find

I have a problem with my code. When I do a POST it doesn't find the author, so it creates a new row in the table, but afterwards the first "try" finds two rows with the same author and raises an error.
Why doesn't the try inside request.method == 'POST' find the user? I know it goes into the except branch because the console prints the except message: "Usuario no existe".
Best regards.
def usuario(request):
    user_actual = request.user.id
    user = User.objects.get(id=user_actual)
    new_form_data = {}
    miUsuario = FormularioUsuario(new_form_data)
    usuario = {}
    print(user)
    print(user_actual)
    print(Usuario.objects.get(autor=user_actual))
    try:
        usuario = Usuario.objects.get(autor=user_actual)
        new_form_data = {}
        new_form_data['irpf'] = usuario.irpf
        new_form_data['reduccion'] = usuario.reduccion
        new_form_data['guardar_normal'] = usuario.guardar_normal
        new_form_data['guardar_ertain'] = usuario.guardar_ertain
        new_form_data['guardar_berezi'] = usuario.guardar_berezi
        miUsuario = FormularioUsuario(new_form_data)
    except:
        print("No existe datos usuario")
        new_form_data = {}
        new_form_data['irpf'] = 0.0
        new_form_data['reduccion'] = 0.0
        new_form_data['guardar_normal'] = False
        new_form_data['guardar_ertain'] = False
        new_form_data['guardar_berezi'] = False
        miUsuario = FormularioUsuario(new_form_data)
    if request.method == 'POST':  # and ('guardar' in request.POST):
        miUsuario = FormularioUsuario(request.POST)
        print(user_actual)
        try:
            usuario = Usuario.objects.get(autor=user_actual)
            usuario.irpf = miUsuario.data['irpf']
            usuario.reduccion = miUsuario.data['reduccion']
            usuario.guardar_normal = miUsuario.data['guardar_normal']
            usuario.guardar_ertain = miUsuario.data['guardar_ertain']
            usuario.guardar_berezi = miUsuario.data['guardar_berezi']
            usuario.save()
            print("Guardado datos usuario")
        except:
            print("Usuario no existe")
            usuario = Usuario(autor=user,
                              irpf=miUsuario.data['irpf'],
                              reduccion=miUsuario.data['reduccion'],
                              guardar_normal='guardar_normal' in miUsuario.data,
                              guardar_ertain='guardar_ertain' in miUsuario.data,
                              guardar_berezi='guardar_berezi' in miUsuario.data,)
            usuario.save()
    new_form_data = {}
    new_form_data['irpf'] = usuario.irpf
    new_form_data['reduccion'] = usuario.reduccion
    new_form_data['guardar_normal'] = usuario.guardar_normal
    new_form_data['guardar_ertain'] = usuario.guardar_ertain
    new_form_data['guardar_berezi'] = usuario.guardar_berezi
    miUsuario = FormularioUsuario(new_form_data)
    return render(request, "BieleGastosApp/usuario_datos.html", {'usuario': miUsuario})
A good first step would be to find out which line in the try block is raising the error. Use except Exception as e: and add print(e) to the except block. That should direct your attention to the specific error that is generated and help you make progress.
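A sketch of that change, using the names from the question (Django's .get() raises MultipleObjectsReturned when two rows share the same autor, and DoesNotExist when none match):

try:
    usuario = Usuario.objects.get(autor=user_actual)
    ...
except Exception as e:
    # e.g. Usuario.DoesNotExist or Usuario.MultipleObjectsReturned
    print(e)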

Django Nested admin

Here is my code:
models.py
from django.db import models

class Country(models.Model):
    country_name = models.CharField(max_length=20, default='')
    country_other_details = models.CharField(max_length=100, default='', null=True)

    class Meta:
        verbose_name_plural = "Countries"

    def __str__(self):
        return self.country_name

class State(models.Model):
    in_country = models.ForeignKey(Country, related_name='in_country', on_delete=models.DO_NOTHING)
    state_name = models.CharField(max_length=20, default='')
    state_other_details = models.CharField(max_length=100, default='', null=True)

    def __str__(self):
        return self.state_name

class City(models.Model):
    in_state = models.ForeignKey(State, related_name='in_state', on_delete=models.DO_NOTHING)
    city_name = models.CharField(max_length=20, default='')
    city_other_details = models.CharField(max_length=100, null=True)

    class Meta:
        verbose_name_plural = "Cities"

    def __str__(self):
        return self.city_name
forms.py
from django.forms.models import BaseInlineFormSet, inlineformset_factory
from django.utils.translation import ugettext_lazy as _
from .models import Country, State, City
# from publishing.utils.forms import is_empty_form, is_form_persisted

CityFormset = inlineformset_factory(State, City, extra=2, fields=("city_name",))

class BaseStateFormset(BaseInlineFormSet):
    def add_fields(self, form, index):
        super(BaseStateFormset, self).add_fields(form, index)
        # save the formset in the 'nested' property
        form.nested = CityFormset(
            instance=form.instance,
            data=form.data if form.is_bound else None,
            files=form.files if form.is_bound else None,
            prefix='address-%s-%s' % (
                form.prefix,
                CityFormset.get_default_prefix()),
            # extra=1
        )

    def is_valid(self):
        result = super(BaseStateFormset, self).is_valid()
        if self.is_bound:
            for form in self.forms:
                if hasattr(form, 'nested'):
                    result = result and form.nested.is_valid()
        return result

    def save(self, commit=True):
        result = super(BaseStateFormset, self).save(commit=commit)
        for form in self.forms:
            if hasattr(form, 'nested'):
                if not self._should_delete_form(form):
                    form.nested.save(commit=commit)
        return result

StateFormset = inlineformset_factory(Country, State, formset=BaseStateFormset, extra=2, fields=("state_name",))
views.py
from django.shortcuts import get_object_or_404, redirect, render
from django.urls import reverse
from . import forms
from .models import Country, State, City

def manage_state(request, parent_id):
    parent = get_object_or_404(Country, id=parent_id)
    if request.method == 'POST':
        formset = forms.StateFormset(request.POST, instance=parent)
        if formset.is_valid():
            formset.save()
            # return redirect('parent_view', parent_id=parent.id)
            return redirect(reverse('india:manage_state', kwargs={"parent_id": parent.id}))
    else:
        formset = forms.StateFormset(instance=parent)
    return render(request, 'home.html', {
        'parent': parent,
        'children_formset': formset})
What I want is a single form that will create an object for the Parent (Country) model, multiple objects for the Child (State) belonging to that country, and multiple objects for the Grandchild (City) belonging to the corresponding State.
For example:
<form method="POST">
Parentform: (Country)
<input name="country_name">
Childform: (State)
<input name="state_name">
GrandChildform: (City)
<input name="city_name">
<button>Add GrandChild</button>
<button>Add Child</button>
<button>Add Parent</button>
</form>
The add buttons should also be able to add more Countries, States and Cities to the form dynamically.
Any help, suggestions or references would be appreciated.
Thanks in advance.
I got the solution, thanks to Python and Django.
We can get this in the Django admin itself using the third-party package django-nested-admin; we don't need to build custom forms, views or anything else.
[Screenshot of the resulting Django admin omitted.]
Here are the steps.
First, install the package with pip:
pip install django-nested-admin
Then add the library to INSTALLED_APPS in settings.py:
INSTALLED_APPS = [
    ...
    'nested_admin',
    ...
]
Add the library's URLs in urls.py:
url(r'^nested_admin/', include('nested_admin.urls')),
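(On newer Django versions, where url() is deprecated and later removed, the equivalent would be:)

from django.urls import include, path
path('nested_admin/', include('nested_admin.urls')),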
Register the models in admin.py:
from django.contrib import admin
from nested_admin import NestedModelAdmin, NestedStackedInline, NestedTabularInline
from .models import State, Country, City

class CityTabularInline(NestedTabularInline):
    model = City
    extra = 1

class StateTabularInline(NestedTabularInline):
    model = State
    extra = 1
    inlines = [CityTabularInline, ]

class CountryAdmin(NestedModelAdmin):
    inlines = [StateTabularInline, ]

admin.site.register(Country, CountryAdmin)
Usage:
NestedModelAdmin: extends Django's ModelAdmin so that the nested inline classes can be used in the inlines attribute.
NestedStackedInline: extends StackedInline and allows further inlines to be nested inside it.
NestedTabularInline: extends TabularInline in the same way.
For more details, please see the django-nested-admin documentation.

Scrape html tables and combine data into mongodb with scrapy

I'm scraping from two different tables and want to combine the data in MongoDB.
Now I have a problem with the second table I want to scrape. That table has one header row and five data rows.
How can I scrape the table so that the MongoDB field contains all the elements (columns) of the table?
The table I want to scrape looks like this:
https://codepen.io/linkslegend/pen/JjPrqLq
This is the code I have so far:
import scrapy
import pymongo
from ..items import testItem

class IssSpider(scrapy.Spider):
    name = "test_spider"
    start_urls = ["https://de.iss.fst.com/dichtungen/radialwellendichtringe/rwdr-mit-geschlossenem-kafig/ba"]

    def parse(self, response):
        self.log("I just visited: " + response.url)
        urls = response.css('.details-button > a::attr(href)').extract()
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        # follow pagination link
        next_page_url = response.css('li.item > a.next::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        item = testItem()
        item['Artikelnummer'] = response.css('td[data-th="Artikelnummer"]::text').extract_first().strip(),
        item['Hersteller'] = response.css('td[data-th="Hersteller"]::text').extract_first().strip(),
        item['Materialvariante'] = response.css('td[data-th="Materialvariante"]::text').extract_first().strip(),
        item['Material'] = response.css('td[data-th="Material"]::text').extract_first().strip(),
        item['Gewicht_Gramm'] = response.css('td[data-th="Gewicht (Gramm)"]::text').extract_first().strip(),
        item['Gehaeusedurchmesser'] = response.css('td[data-th="Gehäusedurchmesser"]::text').extract_first().strip(),
        item['Breite'] = response.css('td[data-th="Breite"]::text').extract_first().strip(),
        item['Innendurchmesser'] = response.css('td[data-th="Innendurchmesser"]::text').extract_first().strip(),
        item['Wellendurchmesser'] = response.css('td[data-th="Wellendurchmesser"]::text').extract_first().strip(),
        item['Außendurchmesser'] = response.css('td[data-th="Außendurchmesser"]::text').extract_first().strip(),
        for row in response.css('tr.offer'):
            item['Lieferant'] = row.css('td.vendor > span.offer-vendor::text').extract_first().strip(),
            item['Anforderungsmenge'] = row.css('td.item-no > span.offer-item-no::text').extract_first().strip(),
            item['Lieferzeit'] = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip(),
            item['PreisproStueck'] = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip()
        yield item
And this is the pipeline for MongoDB:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class testPipeline(object):
    def __init__(self):
        self.conn = pymongo.MongoClient(
            "localhost",
            27017
        )
        db = self.conn["test_db"]
        self.collection = db["test_tb"]

    def process_item(self, item, spider):
        # note: collection.insert() is deprecated in newer pymongo versions;
        # insert_one() is the current equivalent
        self.collection.insert(dict(item))
        return item
My current MongoDB documents look like this: [screenshot omitted]
This is how I want MongoDB to look: [screenshot omitted]
Thanks for any help!
I think the problem is that you're overwriting the values for Lieferant, Anforderungsmenge, Lieferzeit and PreisproStueck on every pass through your loop, so only the last row is kept.
You can try substituting it with the following:
def parse_details(self, response):
    item = testItem()
    ...
    lieferants = []
    anforderungsmenges = []
    lieferzeits = []
    preisprostuecks = []
    for row in response.css('tr.offer'):
        # no trailing commas here: in the original they turned each value into a 1-tuple
        lieferant = row.css('td.vendor > span.offer-vendor::text').extract_first().strip()
        anforderungsmenge = row.css('td.item-no > span.offer-item-no::text').extract_first().strip()
        lieferzeit = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip()
        preisprostueck = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip()
        lieferants.append(lieferant)
        anforderungsmenges.append(anforderungsmenge)
        lieferzeits.append(lieferzeit)
        preisprostuecks.append(preisprostueck)
    item['lieferants'] = lieferants
    item['anforderungsmenges'] = anforderungsmenges
    item['lieferzeits'] = lieferzeits
    item['preisprostuecks'] = preisprostuecks
    yield item
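If the goal is one sub-document per offer in MongoDB rather than four parallel lists, another option is to collect a list of dicts (a sketch using the same selectors; the offers field name is an assumption and would need to be added to testItem):

def parse_details(self, response):
    item = testItem()
    ...
    offers = []  # each table row becomes one dict, stored as a MongoDB sub-document
    for row in response.css('tr.offer'):
        offers.append({
            'Lieferant': row.css('td.vendor > span.offer-vendor::text').extract_first().strip(),
            'Anforderungsmenge': row.css('td.item-no > span.offer-item-no::text').extract_first().strip(),
            'Lieferzeit': row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip(),
            'PreisproStueck': row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip(),
        })
    item['offers'] = offers  # assumes testItem defines an 'offers' field
    yield item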
You can also get the lists directly instead of looping over the rows, similar to the below (untested) code:
def parse_details(self, response):
    item = testItem()
    ...
    item['lieferants'] = [lieferant.strip() for lieferant in response.css('tr.offer > td.vendor > span.offer-vendor::text').extract()]
    item['anforderungsmenges'] = [anforderungsmenge.strip() for anforderungsmenge in response.css('tr.offer > td.item-no > span.offer-item-no::text').extract()]
    item['lieferzeits'] = [lieferzeit.strip() for lieferzeit in response.css('tr.offer > td.replenishment-time > span.offer-replenishment-time::text').extract()]
    item['preisprostuecks'] = [preisprostueck.strip() for preisprostueck in response.css('tr.offer > td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract()]
    yield item
