Django-MPTT: how to make full paths to child pages? - django-urls

I've started using the Django-MPTT app to get a tree-based approach to the pages on my Django site.
For example, I have pages with sub-pages:
Trance:
    Vocal Trance (sub-page)
    Hard Trance (sub-page)
Breaks:
    Atmo Breaks (sub-page)
    Progressive Breaks (sub-page)
How can I get access to them from urls.py?
What pattern will help?
Do I need to store the full path in the model, or can it be done via a URL pattern?

I assume you mean you want to do URLs like this:
/trance/
/trance/vocal-trance/
/trance/hard-trance/
/breaks/
/breaks/atmo-breaks/
/breaks/progressive-breaks/
If so, it's probably best to store the url fragment in your model. Something like:
from mptt.models import MPTTModel
from django.db import models
from django.template.defaultfilters import slugify

class Page(MPTTModel):
    name = models.CharField(max_length=50)
    slug = models.CharField(max_length=50, null=True)
    url = models.CharField(max_length=255, null=True)

    def save(self, *args, **kwargs):
        if self.slug is None:
            # create a slug that's unique among siblings
            slug = slugify(self.name)
            self.slug = slug
            siblings = self.get_siblings()
            i = 1
            while siblings.filter(slug=self.slug).exists():
                i += 1
                self.slug = slug + '-%d' % i
        # now build the URL from the parent's url + this page's slug
        if self.parent:
            self.url = '%s/%s' % (self.parent.url, self.slug)
        else:
            self.url = self.slug
        super(Page, self).save(*args, **kwargs)
Then add a URL pattern:
(r'^pages/(?P<page_url>[\w\d_/-]+)/$', 'pages.views.show_page'),
And in your view you can just fetch the right page:
def show_page(request, page_url=None):
    page = get_object_or_404(Page, url=page_url)
    ...
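For completeness, a get_absolute_url for this model can then be a one-liner. This is just a sketch assuming the 'pages/' prefix from the URL pattern above:

def get_absolute_url(self):
    # assumes the 'pages/' prefix used in the URL pattern above
    return '/pages/%s/' % self.url

One caveat with caching the full url like this: if a page is later renamed or moved under a different parent, its descendants keep their stale urls until they are re-saved.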

Thank you for your attention to my problem.
Here's how I finally did it.
models.py
class WebPage(MPTTModel):
    slug = RuSlugField(max_length=20, unique=True)
    title = models.CharField(max_length=50)
    content = models.TextField()
    parent = TreeForeignKey('self', null=True, blank=True, related_name='children')

    class MPTTMeta:
        order_insertion_by = ['slug']

    def get_absolute_url(self):  # TODO: replace with get_ancestors
        url = "/%s/" % self.slug
        page = self
        while page.parent:
            url = "/%s%s" % (page.parent.slug, url)
            page = page.parent
        return url
urls.py
urlpatterns = patterns('website.views',
    url(r"^add/$", "add_page", name="add"),
    url(r"^(?P<full_slug>.*)/add/$", "add_page", name="add"),
    url(r"^(?P<full_slug>.*)/edit/$", "edit_page", name="edit"),
    url(r'^$', ListView.as_view(model=WebPage, template_name='index.html', context_object_name="webpages_list"), name='index'),
    url(r"^(?P<full_slug>.*)/$", "page", name="page"),
)
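Aside: patterns() was deprecated in Django 1.8 and removed in 1.10, along with string view references such as "add_page". On newer Django versions the equivalent is a plain list of url() calls with imported view callables, roughly:

from django.conf.urls import url
from website import views

urlpatterns = [
    url(r"^add/$", views.add_page, name="add"),
    url(r"^(?P<full_slug>.*)/$", views.page, name="page"),
]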
views.py
def page(request, full_slug):
    # Make a list from full_slug.
    # E.g. /trance/progressive_trance/fonarev -> ['trance', 'progressive_trance', 'fonarev']
    slugs = full_slug.split('/')
    page = None
    # Get the page by its slug (the last element of the path)
    if len(slugs) > 1:
        page = get_object_or_404(WebPage, slug=slugs[-1])  # ['trance', 'vocal_trance'] -> 'vocal_trance'
    elif len(slugs) == 1:
        page = get_object_or_404(WebPage, slug=slugs[0])   # ['trance'] -> 'trance'
    # Check that the page's url matches the requested full_slug
    if page.get_absolute_url().strip('/') == full_slug:
        return render_to_response('page.html', {'page': page}, context_instance=RequestContext(request))
    else:
        raise Http404

def edit_page(request, full_slug):
    slugs = full_slug.split('/')
    page = None
    if len(slugs) > 1:
        page = get_object_or_404(WebPage, slug=slugs[-1])
    elif len(slugs) == 1:
        page = get_object_or_404(WebPage, slug=slugs[0])
    if not page.get_absolute_url().strip('/') == full_slug:
        raise Http404
    # POST: update the existing page with the submitted data
    if request.method == 'POST':
        form = WebPageForm(request.POST, instance=page)
        if form.is_valid():
            form.save()
            return HttpResponseRedirect(page.get_absolute_url())
    # GET: render a form to edit the existing page
    else:
        form = WebPageForm(instance=page)
    return render_to_response('edit_page.html', {'form': form}, context_instance=RequestContext(request))

def add_page(request, full_slug=None):
    parent_page = None
    slugs = None
    if full_slug:
        slugs = full_slug.split('/')
    # If there is a slug in the request (e.g. 'trance'), the new page must be
    # added under that parent page, so fetch the parent first.
    if slugs:
        if len(slugs) > 1:
            parent_page = get_object_or_404(WebPage, slug=slugs[-1])
        elif len(slugs) == 1:
            parent_page = get_object_or_404(WebPage, slug=slugs[0])
    # POST: create the new page
    if request.method == 'POST':
        form = WebPageForm(request.POST)
        if form.is_valid():
            new_page = form.save(commit=False)
            if parent_page:
                new_page.parent = parent_page
            new_page.save()
            return HttpResponseRedirect(new_page.get_absolute_url())
    # GET: return an unbound form
    else:
        form = WebPageForm()
    return render_to_response('add_page.html', {'form': form}, context_instance=RequestContext(request))
The trick is that we have to check that the page really exists at the requested full_slug:
if not page.get_absolute_url().strip('/') == full_slug:
    raise Http404
Otherwise, matching by the last slug alone could serve a page at the wrong path.

There's also a Django app that will do this work for you: django-mptt-urls

def get_absolute_url(self):
    return '/'.join([x['slug'] for x in self.get_ancestors(include_self=True).values()])
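A note on this one-liner: values() with no arguments fetches every column as a dict, and the result has no surrounding slashes. A slightly tightened variant, still assuming the WebPage model above, might be:

def get_absolute_url(self):
    # values_list('slug', flat=True) fetches only the slug column;
    # the wrapping slashes match the URL patterns used above
    slugs = self.get_ancestors(include_self=True).values_list('slug', flat=True)
    return '/%s/' % '/'.join(slugs)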

Related

Scrapy isn't scraping the next page

I am trying to scrape news articles from skynewsarabia.com
class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    sport = "https://www.skynewsarabia.com/sport/"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ["article_content", "tags"],
    }
    allowed_domains = ['www.skynewsarabia.com']
    first_token = "1569266773000"
    scrape_this_link = "https://api.skynewsarabia.com//rest/v2/latest.json?defaultSectionId=6&nextPageToken={}&pageSize=20&types=ARTICLE"
    start_urls = [scrape_this_link.format(first_token)]
    urls = []

    def parse(self, response):
        articles = json.loads(response.text)
        # to get the link for each article we combine the id and the urlFriendlySuffix
        for article in range(0, len(articles["contentItems"])):
            article_id = articles["contentItems"][article]["id"]
            article_url = articles["contentItems"][article]["urlFriendlySuffix"]
            relative_link = article_id + "-" + article_url
            full_link = self.sport + relative_link
            self.urls.append(full_link)
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        self.urls = []
        print("Before Check")
        self.first_token = articles["nextPageToken"]
        if self.first_token is not None:
            next_page = self.scrape_this_link.format(self.first_token)
            print("I am inside!")
            print(next_page)
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        pass
The basic idea here is that you first scrape a URL that contains 20 article links; that response also includes a token for the next page, which you add to the next URL so you can scrape the next 20 links. The problem I am facing is that when I run the script, it takes the next token, gets all the links for that token, and then stops, so I am only scraping 20 links! When I print first_token it gives me something different from the 1569266773000 that is provided by default in the script.
You need to change allowed_domains = ['www.skynewsarabia.com'] to allowed_domains = ['skynewsarabia.com']. Alternatively, remove the allowed_domains attribute completely.
Since you have specified the hostname www, Scrapy filters the requests to api.skynewsarabia.com as offsite and the calls are simply dropped.
Additional tip: use self.logger.info and self.logger.debug instead of the print calls in your code.
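Applied to the spider above, that is a one-line change; a sketch with everything else left as it was:

import scrapy

class SkyNewsSportsSpider(scrapy.Spider):
    name = 'sky_news_sports'
    # no 'www.' prefix: requests to subdomains such as api.skynewsarabia.com
    # now pass the offsite filter instead of being silently dropped
    allowed_domains = ['skynewsarabia.com']
    # ... rest of the spider unchanged ...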

How to stop the Crawler

I am trying to write a crawler that goes to a website and searches for a list of keywords, with a max depth of 2. The scraper is supposed to stop once any of the keywords appears on any page, but the problem I am facing right now is that the crawler does not stop when it first sees one of the keywords.
I have tried an early return, a break, CloseSpider, and even Python exit commands.
My crawler class:
class WebsiteSpider(CrawlSpider):
    name = "webcrawler"
    allowed_domains = ["www.roomtoread.org"]
    start_urls = ["https://" + "www.roomtoread.org"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]
    crawl_count = 0
    words_found = 0

    def check_buzzwords(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        wordlist = [
            "sfdc",
            "pardot",
            "Web-to-Lead",
            "salesforce"
        ]
        url = response.url
        contenttype = response.headers.get("content-type", "").decode('utf-8').lower()
        data = response.body.decode('utf-8')
        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                ok = False
                if not ok:
                    if self.__class__.words_found == 0:
                        self.__class__.words_found += 1
                        print(word + "," + url + ";")
                        # STOP! <- this is where I want the crawler to stop
        return Item()

    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want it to stop execution when if not ok: is True.
When I want to stop a spider, I usually use the exception scrapy.exceptions.CloseSpider(reason='cancelled') from the Scrapy docs.
The example there shows how you can use it:
if 'Bandwidth exceeded' in response.body:
    raise CloseSpider('bandwidth_exceeded')
In your case, something like:
if not ok:
    raise CloseSpider('keyword_found')
Or is that what you meant by "CloseSpider commands", and you have already tried it?
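For reference, here is a minimal sketch of how the raise could slot into the check_buzzwords callback from the question, simplified to a plain substring test:

from scrapy.exceptions import CloseSpider
from scrapy import Item

def check_buzzwords(self, response):
    wordlist = ["sfdc", "pardot", "Web-to-Lead", "salesforce"]
    data = response.body.decode('utf-8')
    for word in wordlist:
        if word in data:
            self.logger.info("%s,%s;", word, response.url)
            # CloseSpider shuts the spider down gracefully: no new requests
            # are scheduled, but requests already in flight may still finish
            raise CloseSpider('keyword_found')
    return Item()

Because the shutdown is graceful, a handful of already-queued pages may still be processed after the keyword is found.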

How to scrape multiple URLs with same parse using Scrapy?

Hi, I am having a problem with my spider script. I want to make my script as readable as possible, and I want to save as much code as possible. Is it possible to use the same parse for different URLs?
I want to scrape only 10 items per page and save them to different item classes in items.py.
Here's my code
def start_requests(self):  # I have 3 URLs here
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/6/posts', self.parse)  # URL 1
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/7/posts', self.parse)  # URL 2
    yield scrapy.Request('https://teslamotorsclub.com/tmc/post-ratings/1/posts', self.parse)  # URL 3

def parse(self, response):  # My logic is something like this
    if Url == Url1:
        item = TmcnfSpiderItem()
    elif Url == Url2:
        item = TmcnfSpiderItem2()
    elif Url == Url3:
        item = TmcnfSpiderItem3()
    if count <= 9:
        count += 1
        info = response.css("[id^='fc-post-" + postno_only + "']")
        author = info.xpath("#data-author").extract_first()
        item['author'] = author
        yield item
    else:
        # Move to the next URL and perform the same parse
        pass
Any idea?
I think you can try to pass all the data from start_requests, like this:
def start_requests(self):
    urls = (
        ('https://teslamotorsclub.com/tmc/post-ratings/6/posts', TmcnfSpiderItem),
        ('https://teslamotorsclub.com/tmc/post-ratings/7/posts', TmcnfSpiderItem2),
        ('https://teslamotorsclub.com/tmc/post-ratings/1/posts', TmcnfSpiderItem3),
    )
    for url, itemclass in urls:
        yield scrapy.Request(url, meta={'itemclass': itemclass})

def parse(self, response):
    item = response.meta['itemclass']()
So you pass the item class for each URL, and in the parse function you create a new instance of that class.
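As a side note, on Scrapy 1.7+ cb_kwargs is the recommended way to pass values to a callback, so the same idea (with the urls tuple from above) could be written as:

def start_requests(self):
    for url, itemclass in urls:
        yield scrapy.Request(url, cb_kwargs={'itemclass': itemclass})

def parse(self, response, itemclass):
    # itemclass arrives as a keyword argument instead of via response.meta
    item = itemclass()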

Flask-WTF: Queries of FormFields in FieldList are none after validate_on_submit

I'm trying to generate dynamic forms using Flask-WTF to create a new product based on some templates. A product will have a list of required key-value pairs based on its type, as well as a list of parts required to build it. The current relevant code looks as follows:
forms.py:
class PartSelectionForm(Form):
    selected_part = QuerySelectField('Part', get_label='serial', allow_blank=True)
    part_type = StringField('Type')
    slot = IntegerField('Slot')
    required = BooleanField('Required')

    def __init__(self, csrf_enabled=False, *args, **kwargs):
        super(PartSelectionForm, self).__init__(csrf_enabled=False, *args, **kwargs)

class NewProductForm(Form):
    serial = StringField('Serial', default='', validators=[DataRequired()])
    notes = TextAreaField('Notes', default='')
    parts = FieldList(FormField(PartSelectionForm))
views.py:
@app.route('/products/new/<prodmodel>', methods=['GET', 'POST'])
@login_required
def new_product(prodmodel):
    try:
        model = db.session.query(ProdModel).filter(ProdModel.id==prodmodel).one()
    except NoResultFound as e:
        flash('No products of model type -' + prodmodel + '- found.', 'error')
        return redirect(url_for('index'))
    keys = db.session.query(ProdTypeTemplate.prod_info_key)\
        .filter(ProdTypeTemplate.prod_type_id==model.prod_type_id)\
        .order_by(ProdTypeTemplate.prod_info_key).all()
    parts_needed = db.session.query(ProdModelTemplate)\
        .filter(ProdModelTemplate.prod_model_id==prodmodel)\
        .order_by(ProdModelTemplate.part_type_id, ProdModelTemplate.slot).all()

    class F(forms.NewProductForm):
        pass

    for key in keys:
        if key.prod_info_key in ['shipped_os', 'factory_os']:
            setattr(F, key.prod_info_key, forms.QuerySelectField(key.prod_info_key, get_label='version'))
        else:
            setattr(F, key.prod_info_key, forms.StringField(key.prod_info_key, validators=[forms.DataRequired()]))
    form = F(request.form)

    if request.method == 'GET':
        for part in parts_needed:
            entry = form.parts.append_entry(forms.PartSelectionForm())
            entry.part_type.data = part.part_type_id
            entry.slot.data = part.slot
            entry.required.data = part.required
            entry.selected_part.query = db.session.query(Part).join(PartModel)\
                .filter(PartModel.part_type_id==part.part_type_id, Part.status=='inventory')
    if 'shipped_os' in form:
        form.shipped_os.query = db.session.query(OSVersion).order_by(OSVersion.version)
    if 'factory_os' in form:
        form.factory_os.query = db.session.query(OSVersion).order_by(OSVersion.version)
    if form.validate_on_submit():
        ...
Everything works as expected on a GET request, but on validate_on_submit I get errors. All of the queries and query_factories for the selected_part QuerySelectFields in the list of PartSelectionForms are None, causing errors either in the WTForms validation code or when Jinja2 attempts to re-render the QuerySelectFields. I'm not sure why this happens on the POST when everything appears to be correct for the GET.
I realized that although I set the required queries on a GET, I wasn't setting them for the PartSelectionForm selected_part entries on the POST; a fresh form instance is built on every request, so queries assigned during the GET are gone by the time the POST is validated. Since I already intended part_type, slot, and required to be hidden form fields, I added the following immediately before validate_on_submit, and everything works correctly:
for entry in form.parts:
    entry.selected_part.query = db.session.query(Part).join(PartModel)\
        .filter(PartModel.part_type_id==entry.part_type.data, Part.status=='inventory')

Django page & category names showing as objects

This code shows page and category names as objects rather than by their titles. It's supposed to show the names, but instead it shows "Page object" and "Category object" for all the titles.
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')

import django
django.setup()
from rango.models import Category, Page

def populate():
    python_cat = add_cat('Python')
    add_page(cat=python_cat,
             title="Official Python Tutorial",
             url="http://docs.python.org/2/tutorial/")
    add_page(cat=python_cat,
             title="How to Think like a Computer Scientist",
             url="http://www.greenteapress.com/thinkpython/")
    add_page(cat=python_cat,
             title="Learn Python in 10 Minutes",
             url="http://www.korokithakis.net/tutorials/python/")
    django_cat = add_cat("Django")
    add_page(cat=django_cat,
             title="Official Django Tutorial",
             url="https://docs.djangoproject.com/en/1.5/intro/tutorial01/")
    add_page(cat=django_cat,
             title="Django Rocks",
             url="http://www.djangorocks.com/")
    add_page(cat=django_cat,
             title="How to Tango with Django",
             url="http://www.tangowithdjango.com/")
    frame_cat = add_cat("Other Frameworks")
    add_page(cat=frame_cat,
             title="Bottle",
             url="http://bottlepy.org/docs/dev/")
    add_page(cat=frame_cat,
             title="Flask",
             url="http://flask.pocoo.org")
    # Print out what we have added to the user.
    for c in Category.objects.all():
        for p in Page.objects.filter(category=c):
            print("- {0} - {1}".format(str(c), str(p)))

def add_page(cat, title, url, views=0):
    p = Page.objects.get_or_create(category=cat, title=title, url=url, views=views)[0]
    return p

def add_cat(name):
    c = Category.objects.get_or_create(name=name)[0]
    return c

# Start execution here!
if __name__ == '__main__':
    print("Starting Rango population script...")
    populate()
What is wrong with the code, or is the fault in another file? I'm using Python 3.4 and Django 1.7. Have I missed a file? Is there another file I should share?
I had the same problem while working through Tango with Django. I had missed this method in rango/models.py, in the Page class:
def __str__(self):  # For Python 2, define __unicode__ too
    return self.title
__unicode__(self) seems to be causing the problem on Python 3. Replace it with __str__(self) and it should work:
from django.db import models

class Category(models.Model):
    name = models.CharField(max_length=128, unique=True)

    def __str__(self):
        return self.name

class Page(models.Model):
    category = models.ForeignKey(Category)
    title = models.CharField(max_length=128)
    url = models.URLField()
    views = models.IntegerField(default=0)

    def __str__(self):
        return self.title
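If the same models also need to run on Python 2, Django ships a decorator that reuses one __str__ definition for both; a sketch for the Category model above:

from django.db import models
from django.utils.encoding import python_2_unicode_compatible

@python_2_unicode_compatible  # adds __unicode__ on Python 2
class Category(models.Model):
    name = models.CharField(max_length=128, unique=True)

    def __str__(self):
        return self.name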
