Webcrawler not working with HTTPS - http

I am having an issue with my web crawler. It can run through any regular old website like a charm, but when it runs into an HTTPS URL it doesn't seem to work.
This is the error I get when I try to run an HTTPS URL through my crawler: name 'htmltext' is not defined
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse
#SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme+'://'+z.netloc
visited =[url]
robotsUrl = TopLevel +'/robots.txt'
while len(urls) < 100:
try:
htmltext = urllib.request.urlopen(urls[0]).read()
robots = urllib.request.urlopen(robotsUrl).read()
disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
except:
print (urls[0])
sourceCode = BeautifulSoup(htmltext, "html.parser")
urls.pop(0)
print(len(urls))
for link in sourceCode.findAll('a', href=True):
if "http://" not in link['href']:
link['href'] = urllib.parse.urljoin(url,link['href'])
in_disallow = False
for i in range(len(disallowList)):
if (disallowList[i]).upper().decode() in link['href'].upper():
in_disallow = True
break
if not in_disallow:
if link['href'] not in visited:
urls.append(link['href'])
visited.append(link['href'])
print (visited)

Related

I am not getting any output for this python code

from bs4 import BeautifulSoup
import requests
import os
url = requests.get("https://www.pexels.com/search/flower/")
soup = BeautifulSoup(url.text, "html.parser")
links = []
x = soup.select('img[src^="https://images.pexels.com/photos"]')
for img in x:
links.append(img['src'])
for l in links:
print(l)
I recommend using the Selenium WebDriver to get the full page source and then parsing it.
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://www.pexels.com/search/flower/"
options = webdriver.FirefoxOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.headless = True
driver = webdriver.Firefox(executable_path="./geckodriver", options=options)
driver.get(url)
content = driver.page_source
driver.quit()
soup = BeautifulSoup(content, "html.parser")
links = []
x = soup.select('img[src^="https://images.pexels.com/photos"]')
for img in x:
links.append(img['src'])
for l in links:
print(l)
The latest version of geckodriver can be downloaded from its GitHub releases page.
I got the following output:
https://images.pexels.com/photos/36753/flower-purple-lical-blosso.jpg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/3860667/pexels-photo-3860667.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/133472/pexels-photo-133472.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4618416/pexels-photo-4618416.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4234543/pexels-photo-4234543.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
https://images.pexels.com/photos/4492525/pexels-photo-4492525.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4210784/pexels-photo-4210784.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4210781/pexels-photo-4210781.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500

Beautiful Soup Scraping Content

Is there a way to get the number (13) at the very end?
I tried below code:
url='https://mgm.gov.tr/?il=Ankara'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_page = urlopen(req).read()
soup = BeautifulSoup(web_page, 'html.parser')
mydivs = soup.find_all("div", {"class": "tahminMax"})[0]
mydivs
and received following output:
<div class="tahminMax"><span class="deger" ng-bind="gunlukTahmin[0].enYuksekGun1 | kaliteKontrol"></span><span class="derece">°C</span></div>
The values are retrieved dynamically by a separate XHR call, which you can find in the browser's network tab. You can extract them as follows:
import requests
headers = {'Origin': 'https://mgm.gov.tr'}
r = requests.get('https://servis.mgm.gov.tr/web/tahminler/saatlik?istno=17130', headers=headers).json()
d = {i['tarih']:i['maksimumRuzgarHizi'] for i in r[0]['tahmin']}
print(d)
The site's content is populated by JavaScript that runs after the page has loaded. You can achieve your goal using Selenium, as shown below.
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
url = 'https://mgm.gov.tr/?il=Ankara'
sada = browser.get(url)
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
for span in tag.findAll('span', attrs={'class': 'deger ng-binding'}):
print(span.text)
browser.close()
Here is BeautifulSoup alone attempting the same task; note that the value 13 will not appear in the output, because it is not present in the static HTML.
# Fetch the static HTML of the forecast page with requests and print the text
# of every <span class="deger" ng-bind=...> inside a <div class="tahminMax">.
from bs4 import BeautifulSoup
import requests
import time  # required by time.sleep() below; without it the script raises NameError

r = requests.get('https://mgm.gov.tr/?il=Ankara')
# requests does not execute JavaScript, so waiting here does not change r.text.
time.sleep(3)
soup = BeautifulSoup(r.text, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
    # Match on the presence of the ng-bind attribute rather than a rendered class.
    for span in tag.findAll('span', attrs={'class': 'deger', 'ng-bind': True}):
        print(span.text)

import start_urls from a csv file in Scrapy

I recently started web scraping with Scrapy. I generated a list of URLs that I want to scrape and saved it in a txt document, one URL per line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem
class RealtorSpider(scrapy.Spider):
    """Spider that extracts the href of every anchor pointing at a
    "/realestateandhomes-detail/" URL on the pages listed in realtor2.txt.
    """

    name = "realtor"
    allowed_domains = ["realtor.com"]

    # Evaluated once, when the class body runs: one start URL per non-blank
    # line of realtor2.txt (opened with the platform default encoding).
    with open('realtor2.txt') as f:
        start_urls = [line.strip() for line in f if line.strip()]

    def parse(self, response):
        """Return a list with one RealtorItem per matching anchor.

        Each item's 'link' field holds the list returned by extracting the
        anchor's href attribute.
        """
        #hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div/li/div/a/@href')
        # XPath addresses attributes with "@"; "#href" is not valid XPath.
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            print(site.extract())
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing them; however, I get a ValueError (missing scheme in request URL):
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
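I think there may be an issue with how start_urls is defined, but I don't know how to proceed.
"ValueError: Missing scheme in request url" means that the URL is missing its scheme (for example, http://).
You can use urljoin to avoid this problem.
Note also that the %FF%FE prefix and the %00 after every character in the reported URL suggest that the file was saved as UTF-16, so it should be read with that encoding.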

Scraping YouTube playlist video links

I wanted to download all videos of this YouTube channel, so I tried to write a script with BeautifulSoup to scrape all the links of the videos.
I did some inspection and found that the elements matching tr class="pl-video yt-uix-tile" can be used to get the links. This is the Python code:
import urllib2
from bs4 import BeautifulSoup
url='https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html=urllib2.urlopen(url)
response=html.read()
soup=BeautifulSoup(response)
res=soup.find_all('tr',class_="pl-video yt-uix-tile ")
print res
But I am not able to get all the links. The output is empty. What can be done to resolve this?
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5')
page = r.text
soup=bs(page,'html.parser')
res=soup.find_all('a',{'class':'pl-video-title-link'})
for l in res:
print l.get("href")
It gives me the output below in PyCharm:
/watch?v=SUOWNXGRc6g&index=1&list=PL3D7BFF1DDBDAAFE5
/watch?v=857zrsYZKGo&index=2&list=PL3D7BFF1DDBDAAFE5
/watch?v=Da1jlmwuW_w&index=3&list=PL3D7BFF1DDBDAAFE5
/watch?v=MIKl8PX838E&index=4&list=PL3D7BFF1DDBDAAFE5
/watch?v=sPFUTJgvVpQ&index=5&list=PL3D7BFF1DDBDAAFE5
/watch?v=maYFI5O6P-8&index=6&list=PL3D7BFF1DDBDAAFE5
/watch?v=6moe-rLZKCk&index=7&list=PL3D7BFF1DDBDAAFE5
/watch?v=eKXnQ83RU3I&index=8&list=PL3D7BFF1DDBDAAFE5
/watch?v=WjE-pWYElsE&index=9&list=PL3D7BFF1DDBDAAFE5
/watch?v=hUA_isgpTHI&index=10&list=PL3D7BFF1DDBDAAFE5
/watch?v=IHg_0HJ5iQo&index=11&list=PL3D7BFF1DDBDAAFE5
/watch?v=H92G3CpSQf4&index=12&list=PL3D7BFF1DDBDAAFE5
/watch?v=B5uJeno3xg8&index=13&list=PL3D7BFF1DDBDAAFE5
/watch?v=hy0mRoT1ZlM&index=14&list=PL3D7BFF1DDBDAAFE5
/watch?v=Xpkbu2GrJpE&index=15&list=PL3D7BFF1DDBDAAFE5
/watch?v=-G91Hp3t6sg&index=16&list=PL3D7BFF1DDBDAAFE5
/watch?v=-zGS_zrL0rY&index=17&list=PL3D7BFF1DDBDAAFE5
/watch?v=4LHIESO0NGk&index=18&list=PL3D7BFF1DDBDAAFE5
/watch?v=8kybpxIixRk&index=19&list=PL3D7BFF1DDBDAAFE5
/watch?v=eHh2Yib7u-A&index=20&list=PL3D7BFF1DDBDAAFE5
/watch?v=zjHYyAJQ7Vw&index=21&list=PL3D7BFF1DDBDAAFE5
/watch?v=ma8aUC-Mf5M&index=22&list=PL3D7BFF1DDBDAAFE5
/watch?v=4MnuiIKCqsQ&index=23&list=PL3D7BFF1DDBDAAFE5
/watch?v=gz6P2E9lkfo&index=24&list=PL3D7BFF1DDBDAAFE5
/watch?v=roulejuE6B8&index=25&list=PL3D7BFF1DDBDAAFE5
/watch?v=NyusGsXc6SQ&index=26&list=PL3D7BFF1DDBDAAFE5
/watch?v=_joTj5XTwuQ&index=27&list=PL3D7BFF1DDBDAAFE5
/watch?v=55G47PgDwkY&index=28&list=PL3D7BFF1DDBDAAFE5
/watch?v=0MkeTcH0SPc&index=29&list=PL3D7BFF1DDBDAAFE5
/watch?v=QjQg8NkHGbw&index=30&list=PL3D7BFF1DDBDAAFE5
/watch?v=2CuTy8SA5kU&index=31&list=PL3D7BFF1DDBDAAFE5
/watch?v=MC2WFgZIZjo&index=32&list=PL3D7BFF1DDBDAAFE5
/watch?v=G_MkSpfKIPA&index=33&list=PL3D7BFF1DDBDAAFE5
/watch?v=Krt3g9HhhZ4&index=34&list=PL3D7BFF1DDBDAAFE5
/watch?v=lIwTbp5N7Hw&index=35&list=PL3D7BFF1DDBDAAFE5
/watch?v=geB8FqcUjo8&index=36&list=PL3D7BFF1DDBDAAFE5
/watch?v=Sqk154QSe8Y&index=37&list=PL3D7BFF1DDBDAAFE5
/watch?v=nq3yUjZGj5c&index=38&list=PL3D7BFF1DDBDAAFE5
/watch?v=8yA0vkjREyI&index=39&list=PL3D7BFF1DDBDAAFE5
/watch?v=AlC_Z5w8nDE&index=40&list=PL3D7BFF1DDBDAAFE5
/watch?v=2jduTfdt8RY&index=41&list=PL3D7BFF1DDBDAAFE5
/watch?v=6AoBM110DAY&index=42&list=PL3D7BFF1DDBDAAFE5
/watch?v=n6xhAVcopYU&index=43&list=PL3D7BFF1DDBDAAFE5
/watch?v=P2tNi1tS0xU&index=44&list=PL3D7BFF1DDBDAAFE5
/watch?v=AEA1qJFpheY&index=45&list=PL3D7BFF1DDBDAAFE5
/watch?v=iA2Efmo2PCA&index=46&list=PL3D7BFF1DDBDAAFE5
/watch?v=0-NTc0ezXes&index=47&list=PL3D7BFF1DDBDAAFE5
/watch?v=jbUQyJdf2P8&index=48&list=PL3D7BFF1DDBDAAFE5
/watch?v=zJ9qzvOOjAM&index=49&list=PL3D7BFF1DDBDAAFE5
/watch?v=wRa5Q2Eloa4&index=50&list=PL3D7BFF1DDBDAAFE5
/watch?v=Df129IGl31I&index=51&list=PL3D7BFF1DDBDAAFE5
/watch?v=SGx03Uqn9JA&index=52&list=PL3D7BFF1DDBDAAFE5
/watch?v=oaNus5QigYA&index=53&list=PL3D7BFF1DDBDAAFE5
/watch?v=fV3cpnNPWo0&index=54&list=PL3D7BFF1DDBDAAFE5
/watch?v=zXXCFmfJMNw&index=55&list=PL3D7BFF1DDBDAAFE5
/watch?v=iFoaqeEtTNU&index=56&list=PL3D7BFF1DDBDAAFE5
/watch?v=UlOM-CUlsBc&index=57&list=PL3D7BFF1DDBDAAFE5
/watch?v=XzTSdfLJt04&index=58&list=PL3D7BFF1DDBDAAFE5
/watch?v=iMe4fW31jMs&index=59&list=PL3D7BFF1DDBDAAFE5
/watch?v=BlKDYBqlfgs&index=60&list=PL3D7BFF1DDBDAAFE5
/watch?v=kOJGmVXuuFA&index=61&list=PL3D7BFF1DDBDAAFE5
/watch?v=wUmId0rwsBQ&index=62&list=PL3D7BFF1DDBDAAFE5
/watch?v=0wy907WZFiA&index=63&list=PL3D7BFF1DDBDAAFE5
/watch?v=ZMcYbf9Hhe4&index=64&list=PL3D7BFF1DDBDAAFE5
/watch?v=yowNavIDzzE&index=65&list=PL3D7BFF1DDBDAAFE5
/watch?v=cJUsL7sc1E8&index=66&list=PL3D7BFF1DDBDAAFE5
/watch?v=Od3xkrxcsE8&index=67&list=PL3D7BFF1DDBDAAFE5
/watch?v=iZMNaPgP4Ak&index=68&list=PL3D7BFF1DDBDAAFE5
/watch?v=PmOtvJqDfqY&index=69&list=PL3D7BFF1DDBDAAFE5
/watch?v=ulFq_0x29sI&index=70&list=PL3D7BFF1DDBDAAFE5
/watch?v=Dmq_WGhJbgI&index=71&list=PL3D7BFF1DDBDAAFE5
/watch?v=S36C23lW5qI&index=72&list=PL3D7BFF1DDBDAAFE5
/watch?v=3r9NGjBvv2w&index=73&list=PL3D7BFF1DDBDAAFE5
/watch?v=ioGWpu8Ud7A&index=74&list=PL3D7BFF1DDBDAAFE5
/watch?v=K7YuusyEvOg&index=75&list=PL3D7BFF1DDBDAAFE5
/watch?v=3OhGkg_XT3o&index=76&list=PL3D7BFF1DDBDAAFE5
/watch?v=G8QK452ynr4&index=77&list=PL3D7BFF1DDBDAAFE5
/watch?v=XVPCXNoiYIg&index=78&list=PL3D7BFF1DDBDAAFE5
/watch?v=m1AeMJux0Zo&index=79&list=PL3D7BFF1DDBDAAFE5
/watch?v=o5LrdSQrWEI&index=80&list=PL3D7BFF1DDBDAAFE5
/watch?v=NcKSFlYEqYY&index=81&list=PL3D7BFF1DDBDAAFE5
/watch?v=N2Tx8S2V8ek&index=82&list=PL3D7BFF1DDBDAAFE5
/watch?v=Iy3wCppq2Yc&index=83&list=PL3D7BFF1DDBDAAFE5
/watch?v=lkadcYQ6SuY&index=84&list=PL3D7BFF1DDBDAAFE5
/watch?v=PQ94MmEg0Qw&index=85&list=PL3D7BFF1DDBDAAFE5
/watch?v=DqNzTaf9g5w&index=86&list=PL3D7BFF1DDBDAAFE5
/watch?v=BWGW8UsO4Hc&index=87&list=PL3D7BFF1DDBDAAFE5
/watch?v=b4MYh6N4z6s&index=88&list=PL3D7BFF1DDBDAAFE5
/watch?v=9xGIlaezMAU&index=89&list=PL3D7BFF1DDBDAAFE5
/watch?v=UC2wAuxECw0&index=90&list=PL3D7BFF1DDBDAAFE5
/watch?v=zRqcoUSbMI0&index=91&list=PL3D7BFF1DDBDAAFE5
/watch?v=D2iMtK8ETGs&index=92&list=PL3D7BFF1DDBDAAFE5
/watch?v=PJL8UChOsSk&index=93&list=PL3D7BFF1DDBDAAFE5
/watch?v=QbD6qwxiEUU&index=94&list=PL3D7BFF1DDBDAAFE5
/watch?v=-ZbdfYleuJU&index=95&list=PL3D7BFF1DDBDAAFE5
/watch?v=JVaGZwuYmck&index=96&list=PL3D7BFF1DDBDAAFE5
/watch?v=5pr7jwYF0JU&index=97&list=PL3D7BFF1DDBDAAFE5
/watch?v=MNCAmgFHcOI&index=98&list=PL3D7BFF1DDBDAAFE5
/watch?v=tXR0AlhNYxQ&index=99&list=PL3D7BFF1DDBDAAFE5
/watch?v=GtWXOzsD5Fw&index=100&list=PL3D7BFF1DDBDAAFE5
And below output on cmd:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
/watch?v=B5uJeno3xg8&list=PL3D7BFF1DDBDAAFE5&index=13
/watch?v=hy0mRoT1ZlM&list=PL3D7BFF1DDBDAAFE5&index=14
/watch?v=Xpkbu2GrJpE&list=PL3D7BFF1DDBDAAFE5&index=15
/watch?v=-G91Hp3t6sg&list=PL3D7BFF1DDBDAAFE5&index=16
/watch?v=-zGS_zrL0rY&list=PL3D7BFF1DDBDAAFE5&index=17
/watch?v=4LHIESO0NGk&list=PL3D7BFF1DDBDAAFE5&index=18
/watch?v=8kybpxIixRk&list=PL3D7BFF1DDBDAAFE5&index=19
/watch?v=eHh2Yib7u-A&list=PL3D7BFF1DDBDAAFE5&index=20
/watch?v=zjHYyAJQ7Vw&list=PL3D7BFF1DDBDAAFE5&index=21
/watch?v=ma8aUC-Mf5M&list=PL3D7BFF1DDBDAAFE5&index=22
/watch?v=4MnuiIKCqsQ&list=PL3D7BFF1DDBDAAFE5&index=23
/watch?v=gz6P2E9lkfo&list=PL3D7BFF1DDBDAAFE5&index=24
/watch?v=roulejuE6B8&list=PL3D7BFF1DDBDAAFE5&index=25
/watch?v=NyusGsXc6SQ&list=PL3D7BFF1DDBDAAFE5&index=26
/watch?v=_joTj5XTwuQ&list=PL3D7BFF1DDBDAAFE5&index=27
/watch?v=55G47PgDwkY&list=PL3D7BFF1DDBDAAFE5&index=28
/watch?v=0MkeTcH0SPc&list=PL3D7BFF1DDBDAAFE5&index=29
/watch?v=QjQg8NkHGbw&list=PL3D7BFF1DDBDAAFE5&index=30
/watch?v=2CuTy8SA5kU&list=PL3D7BFF1DDBDAAFE5&index=31
/watch?v=MC2WFgZIZjo&list=PL3D7BFF1DDBDAAFE5&index=32
/watch?v=G_MkSpfKIPA&list=PL3D7BFF1DDBDAAFE5&index=33
/watch?v=Krt3g9HhhZ4&list=PL3D7BFF1DDBDAAFE5&index=34
/watch?v=lIwTbp5N7Hw&list=PL3D7BFF1DDBDAAFE5&index=35
/watch?v=geB8FqcUjo8&list=PL3D7BFF1DDBDAAFE5&index=36
/watch?v=Sqk154QSe8Y&list=PL3D7BFF1DDBDAAFE5&index=37
/watch?v=nq3yUjZGj5c&list=PL3D7BFF1DDBDAAFE5&index=38
/watch?v=8yA0vkjREyI&list=PL3D7BFF1DDBDAAFE5&index=39
/watch?v=AlC_Z5w8nDE&list=PL3D7BFF1DDBDAAFE5&index=40
/watch?v=2jduTfdt8RY&list=PL3D7BFF1DDBDAAFE5&index=41
/watch?v=6AoBM110DAY&list=PL3D7BFF1DDBDAAFE5&index=42
/watch?v=n6xhAVcopYU&list=PL3D7BFF1DDBDAAFE5&index=43
/watch?v=P2tNi1tS0xU&list=PL3D7BFF1DDBDAAFE5&index=44
/watch?v=AEA1qJFpheY&list=PL3D7BFF1DDBDAAFE5&index=45
/watch?v=iA2Efmo2PCA&list=PL3D7BFF1DDBDAAFE5&index=46
/watch?v=0-NTc0ezXes&list=PL3D7BFF1DDBDAAFE5&index=47
/watch?v=jbUQyJdf2P8&list=PL3D7BFF1DDBDAAFE5&index=48
/watch?v=zJ9qzvOOjAM&list=PL3D7BFF1DDBDAAFE5&index=49
/watch?v=wRa5Q2Eloa4&list=PL3D7BFF1DDBDAAFE5&index=50
/watch?v=Df129IGl31I&list=PL3D7BFF1DDBDAAFE5&index=51
/watch?v=SGx03Uqn9JA&list=PL3D7BFF1DDBDAAFE5&index=52
/watch?v=oaNus5QigYA&list=PL3D7BFF1DDBDAAFE5&index=53
/watch?v=fV3cpnNPWo0&list=PL3D7BFF1DDBDAAFE5&index=54
/watch?v=zXXCFmfJMNw&list=PL3D7BFF1DDBDAAFE5&index=55
/watch?v=iFoaqeEtTNU&list=PL3D7BFF1DDBDAAFE5&index=56
/watch?v=UlOM-CUlsBc&list=PL3D7BFF1DDBDAAFE5&index=57
/watch?v=XzTSdfLJt04&list=PL3D7BFF1DDBDAAFE5&index=58
/watch?v=iMe4fW31jMs&list=PL3D7BFF1DDBDAAFE5&index=59
/watch?v=BlKDYBqlfgs&list=PL3D7BFF1DDBDAAFE5&index=60
/watch?v=kOJGmVXuuFA&list=PL3D7BFF1DDBDAAFE5&index=61
/watch?v=wUmId0rwsBQ&list=PL3D7BFF1DDBDAAFE5&index=62
/watch?v=0wy907WZFiA&list=PL3D7BFF1DDBDAAFE5&index=63
/watch?v=ZMcYbf9Hhe4&list=PL3D7BFF1DDBDAAFE5&index=64
/watch?v=yowNavIDzzE&list=PL3D7BFF1DDBDAAFE5&index=65
/watch?v=cJUsL7sc1E8&list=PL3D7BFF1DDBDAAFE5&index=66
/watch?v=Od3xkrxcsE8&list=PL3D7BFF1DDBDAAFE5&index=67
/watch?v=iZMNaPgP4Ak&list=PL3D7BFF1DDBDAAFE5&index=68
/watch?v=PmOtvJqDfqY&list=PL3D7BFF1DDBDAAFE5&index=69
/watch?v=ulFq_0x29sI&list=PL3D7BFF1DDBDAAFE5&index=70
/watch?v=Dmq_WGhJbgI&list=PL3D7BFF1DDBDAAFE5&index=71
/watch?v=S36C23lW5qI&list=PL3D7BFF1DDBDAAFE5&index=72
/watch?v=3r9NGjBvv2w&list=PL3D7BFF1DDBDAAFE5&index=73
/watch?v=ioGWpu8Ud7A&list=PL3D7BFF1DDBDAAFE5&index=74
/watch?v=K7YuusyEvOg&list=PL3D7BFF1DDBDAAFE5&index=75
/watch?v=3OhGkg_XT3o&list=PL3D7BFF1DDBDAAFE5&index=76
/watch?v=G8QK452ynr4&list=PL3D7BFF1DDBDAAFE5&index=77
/watch?v=XVPCXNoiYIg&list=PL3D7BFF1DDBDAAFE5&index=78
/watch?v=m1AeMJux0Zo&list=PL3D7BFF1DDBDAAFE5&index=79
/watch?v=o5LrdSQrWEI&list=PL3D7BFF1DDBDAAFE5&index=80
/watch?v=NcKSFlYEqYY&list=PL3D7BFF1DDBDAAFE5&index=81
/watch?v=N2Tx8S2V8ek&list=PL3D7BFF1DDBDAAFE5&index=82
/watch?v=Iy3wCppq2Yc&list=PL3D7BFF1DDBDAAFE5&index=83
/watch?v=lkadcYQ6SuY&list=PL3D7BFF1DDBDAAFE5&index=84
/watch?v=PQ94MmEg0Qw&list=PL3D7BFF1DDBDAAFE5&index=85
/watch?v=DqNzTaf9g5w&list=PL3D7BFF1DDBDAAFE5&index=86
/watch?v=BWGW8UsO4Hc&list=PL3D7BFF1DDBDAAFE5&index=87
/watch?v=b4MYh6N4z6s&list=PL3D7BFF1DDBDAAFE5&index=88
/watch?v=9xGIlaezMAU&list=PL3D7BFF1DDBDAAFE5&index=89
/watch?v=UC2wAuxECw0&list=PL3D7BFF1DDBDAAFE5&index=90
/watch?v=zRqcoUSbMI0&list=PL3D7BFF1DDBDAAFE5&index=91
/watch?v=D2iMtK8ETGs&list=PL3D7BFF1DDBDAAFE5&index=92
/watch?v=PJL8UChOsSk&list=PL3D7BFF1DDBDAAFE5&index=93
/watch?v=QbD6qwxiEUU&list=PL3D7BFF1DDBDAAFE5&index=94
/watch?v=-ZbdfYleuJU&list=PL3D7BFF1DDBDAAFE5&index=95
/watch?v=JVaGZwuYmck&list=PL3D7BFF1DDBDAAFE5&index=96
/watch?v=5pr7jwYF0JU&list=PL3D7BFF1DDBDAAFE5&index=97
/watch?v=MNCAmgFHcOI&list=PL3D7BFF1DDBDAAFE5&index=98
/watch?v=tXR0AlhNYxQ&list=PL3D7BFF1DDBDAAFE5&index=99
/watch?v=GtWXOzsD5Fw&list=PL3D7BFF1DDBDAAFE5&index=100
I am using Python 2.7.12 and beautifulsoup4
Try this one:
import urllib2
from bs4 import BeautifulSoup
htmlParser = "lxml"
url='https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html=urllib2.urlopen(url)
response=html.read()
soup=BeautifulSoup(response, htmlParser)
links = soup.find_all('a', attrs={'class':'pl-video-title-link'})
for a in links:
print(a.get("href"))
Output:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
...
Try this:
from bs4 import BeautifulSoup
import requests
def getPlaylistLinks(url):
    """Print the title and absolute URL of each video link on a playlist page.

    Downloads *url* with requests, parses it with BeautifulSoup and, for every
    <a dir="ltr"> whose href starts with '/watch?', prints the anchor text
    followed by the href prefixed with https://www.youtube.com and a blank line.
    """
    sourceCode = requests.get(url).text
    soup = BeautifulSoup(sourceCode, 'html.parser')
    domain = 'https://www.youtube.com'
    for link in soup.find_all("a", {"dir": "ltr"}):
        href = link.get('href')
        # Anchors without an href yield None; skip them instead of crashing.
        if not href or not href.startswith('/watch?'):
            continue
        # link.string is None when the anchor wraps nested tags; get_text()
        # always returns a str.
        print(link.get_text().strip())
        print(domain + href + '\n')
getPlaylistLinks('Your URL')

how to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for it. I want to get the "src" of the iframe. I am using QtWebKit in Python using PySide. It is actually working, but just once. It is not loading other webpages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup
class Render(QWebPage):
def __init__(self, url):
try:
self.app = QApplication(sys.argv)
except RuntimeError:
self.app = QCoreApplication.instance()
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def visit(url):
r = Render(url)
p = r.frame.toHtml()
f_url = str(r.frame.url().toString())
return p
def is_comment_url(url):
    """Return True if *url* contains the substring "comment", ignoring case."""
    # A substring test also accepts a match at index 0, which the previous
    # `find(...) > 0` check rejected (str.find returns -1 only when absent).
    return "comment" in url.lower()
# Read one page URL per line, render each page, and collect the src of every
# iframe whose URL looks like a comment widget.
with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            # append() stores the whole URL; `+=` with a str would add it
            # to the list one character at a time.
            list_of_urls.append(link)
pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as it is displayed by the browser (after executing all the JavaScript, etc.)?

Resources