Get second attribute with Beautiful Soup - web-scraping

I've got this html and a series of images to get with this structure:
<portrait a>
<img class = "image" data = "random stuff" src = "image_link">
<portrait b>
<img class = "image" data = "random stuff" src = "image_link">
I want to get the image_link. I've tried this, but it didn't work. I don't understand how to get the second attribute with get().
# Print the src attribute of every <img> element in `d` that has class "image".
for tag in d.find_all("img", class_ = "image"):
    print(tag.get("src"))
Thanks

Try like this:
from bs4 import BeautifulSoup
# Sample markup containing two <img class="image"> tags.
htmlcontent = """
<portrait a>
<img class = "image" data = "random stuff" src = "image_link1">
<portrait b>
<img class = "image" data = "random stuff" src = "image_link2">
"""

soup = BeautifulSoup(htmlcontent, "lxml")
image_tags = soup.find_all("img", class_="image")
for tag in image_tags:
    # Attributes are looked up by name, so their position in the tag is irrelevant.
    print(tag.get("src"))
Output:
image_link1
image_link2

Related

How to set data-attribute between image's src and alt attributes?

Order:
$img->src = 'someimage.webp';
$img->alt = null;
$img->{'data-ext'} = 'png';
$img->alt = 'somealt';
Result:
<img src="someimage.webp" alt="somealt" data-ext="png">
I'm trying to get this:
<img src="someimage.webp" data-ext="png" alt="somealt">
UPD
I found ugly solution, but this returns valid html
$img->src = 'someimage.webp" data-ext="png';
$img->alt = 'somealt';
If you want the data-attribute to be between src and alt and setting the alt to null in between, one option could be to getAllAttributes() and use krsort to sort the array keys in reverse order.
For example
// Set the attributes; alt is assigned twice (first null, then the real value).
$img->src = 'someimage.webp';
$img->alt = null;
$img->{'data-ext'} = 'png';
$img->alt = 'somealt';
// Fetch the attribute array and sort it by key in descending order:
// "src" > "data-ext" > "alt", which is the order wanted in the output.
$attrs = $img->getAllAttributes();
krsort($attrs);
// Write the re-ordered array back before rendering the element.
$img->attr = $attrs;
echo $img->outertext();
Output
<img src="someimage.webp" data-ext="png" alt="somealt">

Django Nested admin

here is my code:
models.py
from django.db import models
class Country(models.Model):
    """A country, identified by its name plus optional free-text details."""

    country_name = models.CharField(max_length=20, default='')
    country_other_details = models.CharField(
        max_length=100,
        default='',
        null=True,
    )

    class Meta:
        # Avoid the auto-generated plural "Countrys".
        verbose_name_plural = "Countries"

    def __str__(self):
        """Display the country by its name."""
        return self.country_name
class State(models.Model):
    """A state that belongs to exactly one Country."""

    # related_name makes the reverse accessor on Country `country.in_country`.
    in_country = models.ForeignKey(
        Country,
        related_name='in_country',
        on_delete=models.DO_NOTHING,
    )
    state_name = models.CharField(max_length=20, default='')
    state_other_details = models.CharField(
        max_length=100,
        default='',
        null=True,
    )

    def __str__(self):
        """Display the state by its name."""
        return self.state_name
class City(models.Model):
    """A city that belongs to exactly one State."""

    # related_name makes the reverse accessor on State `state.in_state`.
    in_state = models.ForeignKey(
        State,
        related_name='in_state',
        on_delete=models.DO_NOTHING,
    )
    city_name = models.CharField(max_length=20, default='')
    city_other_details = models.CharField(max_length=100, null=True)

    class Meta:
        # Avoid the auto-generated plural "Citys".
        verbose_name_plural = "Cities"

    def __str__(self):
        """Display the city by its name."""
        return self.city_name
forms.py
from django.forms.models import inlineformset_factory
from django.forms.models import BaseInlineFormSet
from .models import Country, State, City
from django.forms.models import BaseInlineFormSet, inlineformset_factory
from django.utils.translation import ugettext_lazy as _
# from publishing.utils.forms import is_empty_form, is_form_persisted
CityFormset = inlineformset_factory(State, City, extra=2, fields=("city_name",))
class BaseStateFormset(BaseInlineFormSet):
    """Inline formset of State forms, each carrying a nested City formset.

    The nested formset is attached to every form as ``form.nested`` and is
    validated and saved together with its parent form.
    """

    def add_fields(self, form, index):
        """Add the regular fields, then attach a CityFormset as ``form.nested``."""
        super(BaseStateFormset, self).add_fields(form, index)

        bound = form.is_bound
        nested_prefix = 'address-%s-%s' % (
            form.prefix,
            CityFormset.get_default_prefix(),
        )
        # Only a bound parent form passes its submitted data/files down.
        form.nested = CityFormset(
            instance=form.instance,
            data=form.data if bound else None,
            files=form.files if bound else None,
            prefix=nested_prefix,
        )

    def is_valid(self):
        """Return True only if this formset and every nested formset validate."""
        valid = super(BaseStateFormset, self).is_valid()
        if not self.is_bound:
            return valid
        nested_sets = [form.nested for form in self.forms if hasattr(form, 'nested')]
        # all() stops at the first invalid nested formset, and nothing is
        # checked at all when the parent formset is already invalid.
        return valid and all(nested.is_valid() for nested in nested_sets)

    def save(self, commit=True):
        """Save the State forms, then the nested City formsets of kept forms."""
        saved = super(BaseStateFormset, self).save(commit=commit)
        for form in self.forms:
            if hasattr(form, 'nested') and not self._should_delete_form(form):
                form.nested.save(commit=commit)
        return saved
StateFormset = inlineformset_factory(Country, State, formset=BaseStateFormset, extra=2, fields=("state_name",))
views.py
from .models import Country, State, City
def manage_state(request, parent_id):
    """Show and process the State formset for the Country with id ``parent_id``.

    GET renders an unbound formset. A valid POST saves it and redirects back
    to this view; an invalid POST re-renders the bound formset.
    Responds with 404 when the Country does not exist.
    """
    parent = get_object_or_404(Country, id=parent_id)

    if request.method != 'POST':
        formset = forms.StateFormset(instance=parent)
    else:
        formset = forms.StateFormset(request.POST, instance=parent)
        if formset.is_valid():
            formset.save()
            target = reverse('india:manage_state', kwargs={"parent_id": parent.id})
            return redirect(target)

    context = {
        'parent': parent,
        'children_formset': formset,
    }
    return render(request, 'home.html', context)
What I want is a single form that creates one object of the parent (Country) model, multiple objects of the child (State) model belonging to that country, and multiple objects of the grandchild (City) model belonging to the corresponding state,
for example:
<form method="POST">
Parentform:(Country)
<input name="country_name">
Childform:(State)
<input name="state_name">
GrandChildform:(City)
<input name = "City_name">
<button> Add GrandChild</button>
<button> Add Child </button>
<button>Add Parent</button>
</form>
The add buttons should also be able to add more countries, states and cities to the form dynamically.
Any help, suggestions or references would be appreciated.
Thanks in Advance.
I found the solution, thanks to Python and Django.
As it turns out, we can customize the built-in admin using the django-nested-admin package.
We don't need to build our own custom forms, views or anything else.
I am Sharing Image of my Django-admin:
Image
To do it I am sharing the Descriptions:
First we install a package using pip:
pip install django-nested-admin.
Now we Add the library in settings.py:
# The `...` entries stand for the apps already listed in the project's settings.
INSTALLED_APPS = [
    ...,
    'nested_admin',
    ...,
]
Add the urls of the library in the urls.py:
url(r'^nested_admin/', include('nested_admin.urls')),
Register models in admin.py:
from django.contrib import admin
from .models import State, Country, City
from nested_admin import NestedModelAdmin, NestedStackedInline, NestedTabularInline
class CityTabularInline(NestedTabularInline):
    """Tabular inline for City rows, with one blank extra form."""

    model = City
    extra = 1
class StateTabularInline(NestedTabularInline):
    """Tabular inline for State rows; each row nests the City inline."""

    model = State
    extra = 1
    inlines = [CityTabularInline]
@admin.register(Country)
class CountryAdmin(NestedModelAdmin):
    """Admin for Country that embeds the State inline (and, through it, City)."""

    inlines = [StateTabularInline]
Usage:
NestedModelAdmin: extends Django's ModelAdmin class so that different
types of classes can be used in the inlines attribute.
NestedStackedInline: to allow the addition of TabularInline classes in the inlines
attribute.
NestedTabularInline: to extend the TabularInline.
for more details please Visit.

Scrape html tables and combine data into mongodb with scrapy

I'm scraping two different tables and want to combine the data in MongoDB.
Now I have a problem with the second table I want to scrape.
The table has 1 header row and 5 data rows.
How can I scrape the table so that each MongoDB field holds all the elements (the whole column) of the table?
The table i want to scrape looks like this
https://codepen.io/linkslegend/pen/JjPrqLq
This is the code I have sofar
import scrapy
import pymongo
from ..items import testItem
# Spider that starts from one category listing, follows every product detail
# link plus the pagination link, and builds a testItem from each detail page.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# statements below (in particular `yield item`) must be confirmed.
class IssSpider(scrapy.Spider):
name = "test_spider"
start_urls = ["https://de.iss.fst.com/dichtungen/radialwellendichtringe/rwdr-mit-geschlossenem-kafig/ba"]
# Listing-page callback: requests each '.details-button > a' link with
# parse_details as callback, then re-enters parse for the 'next' link, if any.
def parse(self, response):
self.log("I just visted:" + response.url)
urls = response.css('.details-button > a::attr(href)').extract()
for url in urls:
yield scrapy.Request(url=url, callback=self.parse_details)
# follow pagination link
next_page_url = response.css('li.item > a.next::attr(href)').extract_first()
if next_page_url:
# the href may be relative; urljoin resolves it against the current page URL
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)
# Detail-page callback: fills a testItem from the td[data-th=...] cells and
# from the 'tr.offer' rows.
def parse_details(self, response):
item = testItem()
# NOTE(review): every assignment below ends with a trailing comma, so the
# value stored is a 1-tuple `(text,)`, not a plain string.
# NOTE(review): presumably extract_first() returns None when the selector
# matches nothing, in which case .strip() raises AttributeError — confirm.
item['Artikelnummer'] = response.css('td[data-th="Artikelnummer"]::text').extract_first().strip(),
item['Hersteller'] = response.css('td[data-th="Hersteller"]::text').extract_first().strip(),
item['Materialvariante'] = response.css('td[data-th="Materialvariante"]::text').extract_first().strip(),
item['Material'] = response.css('td[data-th="Material"]::text').extract_first().strip(),
item['Gewicht_Gramm'] = response.css('td[data-th="Gewicht (Gramm)"]::text').extract_first().strip(),
item['Gehaeusedurchmesser'] = response.css('td[data-th="Gehäusedurchmesser"]::text').extract_first().strip(),
item['Breite'] = response.css('td[data-th="Breite"]::text').extract_first().strip(),
item['Innendurchmesser'] = response.css('td[data-th="Innendurchmesser"]::text').extract_first().strip(),
item['Wellendurchmesser'] = response.css('td[data-th="Wellendurchmesser"]::text').extract_first().strip(),
item['Außendurchmesser'] = response.css('td[data-th="Außendurchmesser"]::text').extract_first().strip(),
# Every iteration assigns to the same four keys of the same item object.
# NOTE(review): if `yield item` sits outside this loop, only the values of
# the last 'tr.offer' row survive — confirm the intended nesting.
for row in response.css('tr.offer'):
item['Lieferant'] = row.css('td.vendor > span.offer-vendor::text').extract_first().strip(),
item['Anforderungsmenge'] = row.css('td.item-no > span.offer-item-no::text').extract_first().strip(),
item['Lieferzeit'] = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip(),
# no trailing comma here, so this one is stored as a plain string
item['PreisproStueck'] = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip()
yield item
and this is the pipeline for mongodb
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class testPipeline(object):
    """Scrapy item pipeline that stores every scraped item in MongoDB.

    Items are written to the ``test_tb`` collection of the ``test_db``
    database on ``localhost:27017``.
    """

    def __init__(self):
        self.conn = pymongo.MongoClient("localhost", 27017)
        db = self.conn["test_db"]
        self.collection = db["test_tb"]

    def process_item(self, item, spider):
        """Insert ``item`` as one document and pass it on unchanged.

        Returns the same ``item`` so that later pipelines still receive it.
        """
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the single-document replacement.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.conn.close()
my current MongoDB looks like this
This is how I want MongoDB to look like
thanks for any help!
I think the problem is that you're overwriting the values for Lieferant, Anforderungsmenge, Lieferzeit & PreisproStueck in your loop.
You can try substituting it with the following:
def parse_details(self, response):
    """Build one item per detail page, collecting every offer row into lists.

    Each of the four offer columns becomes a list with one stripped string
    per ``tr.offer`` row, so no row overwrites another.
    """
    item = testItem()
    ...
    lieferants = []
    anforderungsmenges = []
    lieferzeits = []
    preisprostuecks = []

    def cell_text(row, selector):
        # default='' keeps a missing cell from crashing on None.strip()
        return row.css(selector).extract_first(default='').strip()

    for row in response.css('tr.offer'):
        # No trailing commas here: each value must be a plain string, not a
        # 1-tuple, or MongoDB would store nested arrays.
        lieferants.append(cell_text(row, 'td.vendor > span.offer-vendor::text'))
        anforderungsmenges.append(cell_text(row, 'td.item-no > span.offer-item-no::text'))
        lieferzeits.append(cell_text(row, 'td.replenishment-time > span.offer-replenishment-time::text'))
        preisprostuecks.append(cell_text(row, 'td.cell.price-per-item > span.offer-price-per-item > span.price::text'))

    item['lieferants'] = lieferants
    item['anforderungsmenges'] = anforderungsmenges
    item['lieferzeits'] = lieferzeits
    item['preisprostuecks'] = preisprostuecks
    yield item
You can also get the lists directly instead of looping over them, similar to the below (untested) code:
def parse_details(self, response):
    """Build one item per detail page, reading each offer column in one query.

    Every list holds the stripped text of one column across all ``tr.offer``
    rows. The lists only line up row by row when every row contains all four
    cells; use a per-row loop if cells can be missing.
    """
    item = testItem()
    ...

    def column(selector):
        # All matches for the selector, in document order, whitespace-stripped.
        return [text.strip() for text in response.css(selector).extract()]

    # Each key reads its own column (vendor / item number / replenishment
    # time / price per item).
    item['lieferants'] = column('tr.offer > td.vendor > span.offer-vendor::text')
    item['anforderungsmenges'] = column('tr.offer > td.item-no > span.offer-item-no::text')
    item['lieferzeits'] = column('tr.offer > td.replenishment-time > span.offer-replenishment-time::text')
    item['preisprostuecks'] = column('tr.offer > td.cell.price-per-item > span.offer-price-per-item > span.price::text')
    yield item

openpyxl - How to preserve xlsx custom properties

How do I preserve custom properties from xlsx template which I am modifying with openpyxl? When I save() workbook using openpyxl these custom properties vanish!
Custom properties can be found here:-
On Mac -> Go to File Menu in Excel -> Properties ... -> Custom tab ->
Properties section
I am posting a pure python solution to reading and writing Workbook.CustomDocumentProperties just because I am currently also feeling the pain of not having this in openpyxl, and I needed a quick workaround for a personal automation project.
In fact, I will try to implement this feature (and hopefully later Worksheet.CustomProperties) into openpyxl myself if I can get my head around how to do all the plumbing the library needs: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1003
Update: I pushed my contribution and it should be accepted and merged shortly :) https://foss.heptapod.net/openpyxl/openpyxl/-/merge_requests/384
So for now, here is a workaround, converting the .xlsx to .zip, then reading and writing the .xml files in the zip directly, and then renaming to .xlsx at the end.
To read Workbook.CustomDocumentProperties you can do this - only very slightly modified from this great answer: https://stackoverflow.com/a/46919795/9792594
from lxml import etree as ET
import zipfile
def get_custom_doc_properties(filename):
    """Read the custom document properties of an .xlsx workbook.

    The workbook is opened directly as a zip archive; the file on disk is
    neither renamed nor modified.

    Args:
        filename: path to the .xlsx file.

    Returns:
        A dict mapping each property name to the text of its value element
        (always a string, whatever the declared value type). Empty when the
        workbook has no ``docProps/custom.xml`` part.
    """
    docPr_ns = "{http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}"
    workbook_props = {}

    with zipfile.ZipFile(filename) as archive:
        try:
            text = archive.read('docProps/custom.xml')
        except KeyError:
            # The part only exists once at least one custom property is set.
            return workbook_props

    xml = ET.fromstring(text)
    for child in xml:
        if child.tag == f"{docPr_ns}property":
            # The value lives in the typed child element (vt:lpwstr, vt:i4, ...).
            for cusPr in child:
                workbook_props[child.attrib['name']] = cusPr.text
    return workbook_props
#call like this:
get_custom_doc_properties(f'./example.xlsx')
And to add one prop to a document which already has custom doc props (and therefore already has a 'docProps/custom.xml' file), is pretty easy and we just append one more custom property to the xml.
(However, if the document had no current custom doc props, then we need to generate the 'docProps/custom.xml' file from scratch, as well as add a content override and a relationship - see code comments):
import os
from lxml import etree as ET
import zipfile
import shutil
import datetime
from tempfile import NamedTemporaryFile
def set_workbook_custom_document_properties(filename, cus_doc_prop_name, cus_doc_prop_val):
    """Add one custom document property to an .xlsx workbook, in place.

    The workbook is rewritten into a temporary archive next to it, which
    replaces the original only after it has been written completely, so a
    failure leaves the original file untouched.

    Args:
        filename: path to the .xlsx file.
        cus_doc_prop_name: property name; must be a ``str``.
        cus_doc_prop_val: property value; ``str``, ``bool``, ``int``,
            ``float`` or ``datetime.datetime``. A datetime is written
            verbatim with a ``Z`` suffix and no timezone conversion.

    Returns:
        None. An unsupported name or value type prints a message and returns
        without touching the file.

    Raises:
        ValueError: the archive has no ``docProps/custom.xml`` and also lacks
            ``_rels/.rels`` or ``[Content_Types].xml``.
    """
    if not isinstance(cus_doc_prop_name, str):
        print("you must supply a string as the 'cus_doc_prop_name'")
        return

    if isinstance(cus_doc_prop_val, str):
        docPr_type_suffix = "lpwstr"
        cus_doc_prop_str = cus_doc_prop_val
    elif isinstance(cus_doc_prop_val, bool):
        # Must be tested before int: bool is a subclass of int.
        docPr_type_suffix = "bool"
        # xsd:boolean is lower-case; str(True) would give "True".
        cus_doc_prop_str = "true" if cus_doc_prop_val else "false"
    elif isinstance(cus_doc_prop_val, int):
        docPr_type_suffix = "i4"
        cus_doc_prop_str = str(cus_doc_prop_val)
    elif isinstance(cus_doc_prop_val, float):
        docPr_type_suffix = "r8"
        cus_doc_prop_str = str(cus_doc_prop_val)
    elif isinstance(cus_doc_prop_val, datetime.datetime):
        docPr_type_suffix = "filetime"
        cus_doc_prop_str = cus_doc_prop_val.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        print("you must supply a string, int, float, bool, or date, as the 'cus_doc_prop_val'")
        return

    custom_part = 'docProps/custom.xml'
    docPr = "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties"
    docPr_type = "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"  # i4, r8, filetime, bool, lpwstr
    xml_declaration = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    base_xml = '{dec}<Properties xmlns="{docPr}" xmlns:vt="{docPr_type}"></Properties>'.format(
        dec=xml_declaration, docPr=docPr, docPr_type=docPr_type).encode('utf-8')

    path_file = os.path.abspath(filename)
    # Reserve a temp name in the workbook's own directory so the final
    # os.replace() stays on one filesystem. The handle is closed at once
    # because Windows cannot reopen a file that is still held open.
    with NamedTemporaryFile(dir=os.path.dirname(path_file), suffix=".tmp", delete=False) as tmp_file:
        tmp_path = tmp_file.name

    try:
        with zipfile.ZipFile(path_file, 'r') as zip_in, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zip_out:
            zip_out.comment = zip_in.comment  # preserve the comment
            custom_present = custom_part in zip_in.namelist()
            custom_xml = rels_xml = content_types_xml = None

            # Copy every part through unchanged, holding back the ones that
            # are rewritten below.
            for item in zip_in.infolist():
                payload = zip_in.read(item.filename)
                if item.filename == custom_part:
                    custom_xml = ET.fromstring(payload)
                elif not custom_present and item.filename == '_rels/.rels':
                    rels_xml = ET.fromstring(payload)
                elif not custom_present and item.filename == '[Content_Types].xml':
                    content_types_xml = ET.fromstring(payload)
                else:
                    zip_out.writestr(item, payload)

            if not custom_present:
                # custom.xml has to be created from scratch, and the package
                # needs a content-type override and a relationship for it.
                if rels_xml is None or content_types_xml is None:
                    raise ValueError(
                        "%s lacks '_rels/.rels' or '[Content_Types].xml'" % path_file)
                custom_xml = ET.fromstring(base_xml)
                _register_custom_properties_part(zip_out, content_types_xml, rels_xml)

            # The new pid is one past the highest existing pid, and never
            # lower than 2.
            max_pid = 1
            for node in custom_xml:
                pid = node.get('pid')
                if pid is not None:
                    max_pid = max(int(pid), max_pid)

            # New elements carry explicit namespaces so the result does not
            # depend on how the serializer treats un-namespaced children.
            child = ET.SubElement(custom_xml, "{%s}property" % docPr)
            child.set('name', cus_doc_prop_name)
            child.set('pid', str(max_pid + 1))
            child.set('fmtid', "{D5CDD505-2E9C-101B-9397-08002B2CF9AE}")
            val = ET.SubElement(child, "{%s}%s" % (docPr_type, docPr_type_suffix))
            val.text = cus_doc_prop_str
            zip_out.writestr(custom_part, ET.tostring(custom_xml))

        shutil.copymode(path_file, tmp_path)
        os.replace(tmp_path, path_file)
    finally:
        # Only still present when something above failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _xml_namespace(element):
    """Return the ``{uri}`` part of ``element``'s tag, or '' if it has none."""
    tag = element.tag
    if isinstance(tag, str) and tag.startswith("{"):
        return tag[:tag.index("}") + 1]
    return ""


def _register_custom_properties_part(zip_out, content_types_xml, rels_xml):
    """Declare docProps/custom.xml in the package and write both manifests."""
    override = ET.SubElement(content_types_xml, _xml_namespace(content_types_xml) + "Override")
    override.set('ContentType', "application/vnd.openxmlformats-officedocument.custom-properties+xml")
    override.set('PartName', '/docProps/custom.xml')
    zip_out.writestr('[Content_Types].xml', ET.tostring(content_types_xml))

    # Only ids of the form rId<number> are counted; other id schemes cannot
    # collide with the id generated here.
    max_rid = 0
    for node in rels_xml:
        rid = node.get('Id') or ''
        if rid.startswith('rId') and rid[3:].isdigit():
            max_rid = max(int(rid[3:]), max_rid)
    relationship = ET.SubElement(rels_xml, _xml_namespace(rels_xml) + "Relationship")
    relationship.set('Type', "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties")
    relationship.set('Target', 'docProps/custom.xml')
    relationship.set('Id', "rId" + str(max_rid + 1))
    zip_out.writestr('_rels/.rels', ET.tostring(rels_xml))
#call it like this:
set_workbook_custom_document_properties(f'./example.xlsx', "testDocProp7", 2.5)

form.SchemaEditForm with ignoreContext

everybody
I'm trying to make an edit form for an object other than the context. It's a dictionary stored in the session. I'm following Martin Aspeli's Schema-Driven Forms. It should be easy, but for some reason the edit form doesn't load any data. Maybe I'm missing some simple detail, but I can't find it. I set ignoreContext=True and tried returning a dictionary, and after that an instance that implements the schema, but it didn't work.
from plone.directives import form, dexterity
from zope.interface import invariant, Invalid, implements
class IUser(form.Schema):
    """Schema describing a user; every field is a single line of text."""

    uid = schema.TextLine(title=_(u'Login'))

    gn = schema.TextLine(title=_(u'Name'))

    sn = schema.TextLine(title=_(u'Surname'))

    uniqueIdentifier = schema.TextLine(
        title=_(u'Identifier'),
        description=_(u'Ej. V11222333'),
    )

    accClass = schema.TextLine(title=_(u'User class'))

    dateExpiration = schema.TextLine(title=_(u'Date of expiration'))
class User:
    """Plain value object that provides IUser, built from keyword arguments."""
    implements(IUser)

    def __init__(self, **kargs):
        """Copy the user data out of ``kargs``.

        All keys are required except 'dateExpiration', which falls back to
        '2015/06'. Note that the ``gn`` attribute is read from 'givenName'.
        """
        required = (
            ('uid', 'uid'),
            ('gn', 'givenName'),
            ('sn', 'sn'),
            ('uniqueIdentifier', 'uniqueIdentifier'),
            ('accClass', 'accClass'),
        )
        for attribute, key in required:
            setattr(self, attribute, kargs[key])
        self.dateExpiration = kargs.get('dateExpiration', '2015/06')
class Edit(form.SchemaEditForm):
    '''Modify User: edit form for the session user, not for the context.'''
    grok.context(IUserManager)  # It's ok. This is a view of an IUserManager object
    grok.require('zope2.View')  # <- just for testing
    grok.name('modify')

    schema = IUser
    # Must be False: with ignoreContext = True, z3c.form does not load widget
    # values from getContent(), so the form renders empty. Pointing the form
    # at another object is done by overriding getContent() alone.
    ignoreContext = False
    label = 'Modify an User'

    def getContent(self):
        '''Return the object the widgets read from and write to.

        This is a User built from the current session entry instead of
        self.context.
        '''
        return User(**SessionUsers(self.context).current())

Resources