I am not getting any output for this Python code - web-scraping

from bs4 import BeautifulSoup
import requests
import os

url = requests.get("https://www.pexels.com/search/flower/")
soup = BeautifulSoup(url.text, "html.parser")
links = []
x = soup.select('img[src^="https://images.pexels.com/photos"]')
for img in x:
    links.append(img['src'])
for l in links:
    print(l)

The image grid on that page is populated by JavaScript, so the plain requests response does not contain the image tags you are selecting. I recommend using the Selenium WebDriver to get the fully rendered page source and then parse it:
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.pexels.com/search/flower/"
options = webdriver.FirefoxOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.headless = True
driver = webdriver.Firefox(executable_path="./geckodriver", options=options)
driver.get(url)
content = driver.page_source
driver.quit()

soup = BeautifulSoup(content, "html.parser")
links = []
x = soup.select('img[src^="https://images.pexels.com/photos"]')
for img in x:
    links.append(img['src'])
for l in links:
    print(l)
You can download the latest version of geckodriver here.
I got the following output:
https://images.pexels.com/photos/36753/flower-purple-lical-blosso.jpg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/3860667/pexels-photo-3860667.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/133472/pexels-photo-133472.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4618416/pexels-photo-4618416.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4234543/pexels-photo-4234543.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
https://images.pexels.com/photos/4492525/pexels-photo-4492525.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4210784/pexels-photo-4210784.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
https://images.pexels.com/photos/4210781/pexels-photo-4210781.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
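If the goal is to save the images as well (the unused import os in the question hints at that), here is a minimal sketch that downloads each collected URL; the flowers folder and file-naming scheme are just examples:

import os
import requests

os.makedirs("flowers", exist_ok=True)
for i, link in enumerate(links):
    r = requests.get(link)
    r.raise_for_status()
    # the URLs carry query strings, so build a clean local name instead
    filename = os.path.join("flowers", "flower_%03d.jpg" % i)
    with open(filename, "wb") as f:
        f.write(r.content)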

Beautiful Soup Scraping Content

Is there a way to get the number (13) at the very end?
I tried the code below:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = 'https://mgm.gov.tr/?il=Ankara'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_page = urlopen(req).read()
soup = BeautifulSoup(web_page, 'html.parser')
mydivs = soup.find_all("div", {"class": "tahminMax"})[0]
mydivs
and received the following output:
<div class="tahminMax"><span class="deger" ng-bind="gunlukTahmin[0].enYuksekGun1 | kaliteKontrol"></span><span class="derece">°C</span></div>
The values are retrieved dynamically by an XHR call, which you can find in the browser's network tab. You can extract them as follows:
import requests
headers = {'Origin': 'https://mgm.gov.tr'}
r = requests.get('https://servis.mgm.gov.tr/web/tahminler/saatlik?istno=17130', headers=headers).json()
d = {i['tarih']:i['maksimumRuzgarHizi'] for i in r[0]['tahmin']}
print(d)
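The daily maximum the question actually asks about (the 13) seems to come from a sibling service. Assuming a gunluk (daily) variant of the endpoint above exists and returns the enYuksekGun1 field referenced by the ng-bind in the question's HTML, a sketch would be:

import requests

headers = {'Origin': 'https://mgm.gov.tr'}
# assumed endpoint: the daily ("gunluk") counterpart of the hourly service
r = requests.get('https://servis.mgm.gov.tr/web/tahminler/gunluk?istno=17130',
                 headers=headers).json()
# enYuksekGun1 is the field named in the question's ng-bind attribute
print(r[0]['enYuksekGun1'])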
The site's content is rendered by JavaScript events that fire after the initial page load. With the code below you can achieve your goal using Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
url = 'https://mgm.gov.tr/?il=Ankara'
browser.get(url)
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
    for span in tag.findAll('span', attrs={'class': 'deger ng-binding'}):
        print(span.text)
browser.close()
Here is the same selection with plain BeautifulSoup, but the value (13) will not be present because it is loaded dynamically:
from bs4 import BeautifulSoup
import requests
import time

r = requests.get('https://mgm.gov.tr/?il=Ankara')
time.sleep(3)  # note: sleeping does not help; the response is already complete
soup = BeautifulSoup(r.text, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
    for span in tag.findAll('span', attrs={'class': 'deger', 'ng-bind': True}):
        print(span.text)

Cannot find the desired link for download (Python BeautifulSoup)

I am pretty new to Python's Beautiful Soup and I don't have much knowledge of HTML or JS. I tried to use bs4 to download all the .xls files on this page, but it seems that bs4 cannot find the links under the "attachment" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
"""
Get all links
"""
def getAllLinks(url):
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),"html.parser")
links = soup.find_all('a', href=True)
return links
def download(links):
for link in links:
#raw_input("Press Enter to continue...")
#print link
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))[1]
suffix = os.path.splitext(os.path.basename(link['href']))[1]
if os.path.splitext(os.path.basename(link['href']))[1] == '.xls':
print link #cannot find anything
currentLink = urllib2.urlopen(link)
links =
getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!
This seems to be one of the tasks for which BeautifulSoup (by itself, at least) is inadequate. You can, however, do it with selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[#class="file"]/a')
>>> len(links)
30
>>> for link in links:
... link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
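From there, saving just the .xls files is straightforward. A minimal sketch, using urllib2 to match the question's Python 2 code and the RawData folder from its docstring, and assuming links is the selenium list from above:

import os
import urllib2

if not os.path.isdir('RawData'):
    os.makedirs('RawData')
for link in links:
    href = link.get_attribute('href')
    if href and href.endswith('.xls'):
        # the basename still contains %20 escapes, but is a valid filename
        with open(os.path.join('RawData', os.path.basename(href)), 'wb') as f:
            f.write(urllib2.urlopen(href).read())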

Scraping YouTube playlist video links

I wanted to download all videos of this Youtube channel. So I tried to write a script with BeautifulSoup to scrape all the links of the videos.
I did some inspection and found that the tr elements with class "pl-video yt-uix-tile" can be used to get the links. This is the Python code:
import urllib2
from bs4 import BeautifulSoup
url='https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html=urllib2.urlopen(url)
response=html.read()
soup=BeautifulSoup(response)
res=soup.find_all('tr',class_="pl-video yt-uix-tile ")
print res
But I am not able to get all the links. The output is empty. What can be done to resolve this?
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5')
page = r.text
soup = bs(page, 'html.parser')
res = soup.find_all('a', {'class': 'pl-video-title-link'})
for l in res:
    print l.get("href")
It gives me the following output in PyCharm:
/watch?v=SUOWNXGRc6g&index=1&list=PL3D7BFF1DDBDAAFE5
/watch?v=857zrsYZKGo&index=2&list=PL3D7BFF1DDBDAAFE5
/watch?v=Da1jlmwuW_w&index=3&list=PL3D7BFF1DDBDAAFE5
/watch?v=MIKl8PX838E&index=4&list=PL3D7BFF1DDBDAAFE5
/watch?v=sPFUTJgvVpQ&index=5&list=PL3D7BFF1DDBDAAFE5
/watch?v=maYFI5O6P-8&index=6&list=PL3D7BFF1DDBDAAFE5
/watch?v=6moe-rLZKCk&index=7&list=PL3D7BFF1DDBDAAFE5
/watch?v=eKXnQ83RU3I&index=8&list=PL3D7BFF1DDBDAAFE5
/watch?v=WjE-pWYElsE&index=9&list=PL3D7BFF1DDBDAAFE5
/watch?v=hUA_isgpTHI&index=10&list=PL3D7BFF1DDBDAAFE5
/watch?v=IHg_0HJ5iQo&index=11&list=PL3D7BFF1DDBDAAFE5
/watch?v=H92G3CpSQf4&index=12&list=PL3D7BFF1DDBDAAFE5
/watch?v=B5uJeno3xg8&index=13&list=PL3D7BFF1DDBDAAFE5
/watch?v=hy0mRoT1ZlM&index=14&list=PL3D7BFF1DDBDAAFE5
/watch?v=Xpkbu2GrJpE&index=15&list=PL3D7BFF1DDBDAAFE5
/watch?v=-G91Hp3t6sg&index=16&list=PL3D7BFF1DDBDAAFE5
/watch?v=-zGS_zrL0rY&index=17&list=PL3D7BFF1DDBDAAFE5
/watch?v=4LHIESO0NGk&index=18&list=PL3D7BFF1DDBDAAFE5
/watch?v=8kybpxIixRk&index=19&list=PL3D7BFF1DDBDAAFE5
/watch?v=eHh2Yib7u-A&index=20&list=PL3D7BFF1DDBDAAFE5
/watch?v=zjHYyAJQ7Vw&index=21&list=PL3D7BFF1DDBDAAFE5
/watch?v=ma8aUC-Mf5M&index=22&list=PL3D7BFF1DDBDAAFE5
/watch?v=4MnuiIKCqsQ&index=23&list=PL3D7BFF1DDBDAAFE5
/watch?v=gz6P2E9lkfo&index=24&list=PL3D7BFF1DDBDAAFE5
/watch?v=roulejuE6B8&index=25&list=PL3D7BFF1DDBDAAFE5
/watch?v=NyusGsXc6SQ&index=26&list=PL3D7BFF1DDBDAAFE5
/watch?v=_joTj5XTwuQ&index=27&list=PL3D7BFF1DDBDAAFE5
/watch?v=55G47PgDwkY&index=28&list=PL3D7BFF1DDBDAAFE5
/watch?v=0MkeTcH0SPc&index=29&list=PL3D7BFF1DDBDAAFE5
/watch?v=QjQg8NkHGbw&index=30&list=PL3D7BFF1DDBDAAFE5
/watch?v=2CuTy8SA5kU&index=31&list=PL3D7BFF1DDBDAAFE5
/watch?v=MC2WFgZIZjo&index=32&list=PL3D7BFF1DDBDAAFE5
/watch?v=G_MkSpfKIPA&index=33&list=PL3D7BFF1DDBDAAFE5
/watch?v=Krt3g9HhhZ4&index=34&list=PL3D7BFF1DDBDAAFE5
/watch?v=lIwTbp5N7Hw&index=35&list=PL3D7BFF1DDBDAAFE5
/watch?v=geB8FqcUjo8&index=36&list=PL3D7BFF1DDBDAAFE5
/watch?v=Sqk154QSe8Y&index=37&list=PL3D7BFF1DDBDAAFE5
/watch?v=nq3yUjZGj5c&index=38&list=PL3D7BFF1DDBDAAFE5
/watch?v=8yA0vkjREyI&index=39&list=PL3D7BFF1DDBDAAFE5
/watch?v=AlC_Z5w8nDE&index=40&list=PL3D7BFF1DDBDAAFE5
/watch?v=2jduTfdt8RY&index=41&list=PL3D7BFF1DDBDAAFE5
/watch?v=6AoBM110DAY&index=42&list=PL3D7BFF1DDBDAAFE5
/watch?v=n6xhAVcopYU&index=43&list=PL3D7BFF1DDBDAAFE5
/watch?v=P2tNi1tS0xU&index=44&list=PL3D7BFF1DDBDAAFE5
/watch?v=AEA1qJFpheY&index=45&list=PL3D7BFF1DDBDAAFE5
/watch?v=iA2Efmo2PCA&index=46&list=PL3D7BFF1DDBDAAFE5
/watch?v=0-NTc0ezXes&index=47&list=PL3D7BFF1DDBDAAFE5
/watch?v=jbUQyJdf2P8&index=48&list=PL3D7BFF1DDBDAAFE5
/watch?v=zJ9qzvOOjAM&index=49&list=PL3D7BFF1DDBDAAFE5
/watch?v=wRa5Q2Eloa4&index=50&list=PL3D7BFF1DDBDAAFE5
/watch?v=Df129IGl31I&index=51&list=PL3D7BFF1DDBDAAFE5
/watch?v=SGx03Uqn9JA&index=52&list=PL3D7BFF1DDBDAAFE5
/watch?v=oaNus5QigYA&index=53&list=PL3D7BFF1DDBDAAFE5
/watch?v=fV3cpnNPWo0&index=54&list=PL3D7BFF1DDBDAAFE5
/watch?v=zXXCFmfJMNw&index=55&list=PL3D7BFF1DDBDAAFE5
/watch?v=iFoaqeEtTNU&index=56&list=PL3D7BFF1DDBDAAFE5
/watch?v=UlOM-CUlsBc&index=57&list=PL3D7BFF1DDBDAAFE5
/watch?v=XzTSdfLJt04&index=58&list=PL3D7BFF1DDBDAAFE5
/watch?v=iMe4fW31jMs&index=59&list=PL3D7BFF1DDBDAAFE5
/watch?v=BlKDYBqlfgs&index=60&list=PL3D7BFF1DDBDAAFE5
/watch?v=kOJGmVXuuFA&index=61&list=PL3D7BFF1DDBDAAFE5
/watch?v=wUmId0rwsBQ&index=62&list=PL3D7BFF1DDBDAAFE5
/watch?v=0wy907WZFiA&index=63&list=PL3D7BFF1DDBDAAFE5
/watch?v=ZMcYbf9Hhe4&index=64&list=PL3D7BFF1DDBDAAFE5
/watch?v=yowNavIDzzE&index=65&list=PL3D7BFF1DDBDAAFE5
/watch?v=cJUsL7sc1E8&index=66&list=PL3D7BFF1DDBDAAFE5
/watch?v=Od3xkrxcsE8&index=67&list=PL3D7BFF1DDBDAAFE5
/watch?v=iZMNaPgP4Ak&index=68&list=PL3D7BFF1DDBDAAFE5
/watch?v=PmOtvJqDfqY&index=69&list=PL3D7BFF1DDBDAAFE5
/watch?v=ulFq_0x29sI&index=70&list=PL3D7BFF1DDBDAAFE5
/watch?v=Dmq_WGhJbgI&index=71&list=PL3D7BFF1DDBDAAFE5
/watch?v=S36C23lW5qI&index=72&list=PL3D7BFF1DDBDAAFE5
/watch?v=3r9NGjBvv2w&index=73&list=PL3D7BFF1DDBDAAFE5
/watch?v=ioGWpu8Ud7A&index=74&list=PL3D7BFF1DDBDAAFE5
/watch?v=K7YuusyEvOg&index=75&list=PL3D7BFF1DDBDAAFE5
/watch?v=3OhGkg_XT3o&index=76&list=PL3D7BFF1DDBDAAFE5
/watch?v=G8QK452ynr4&index=77&list=PL3D7BFF1DDBDAAFE5
/watch?v=XVPCXNoiYIg&index=78&list=PL3D7BFF1DDBDAAFE5
/watch?v=m1AeMJux0Zo&index=79&list=PL3D7BFF1DDBDAAFE5
/watch?v=o5LrdSQrWEI&index=80&list=PL3D7BFF1DDBDAAFE5
/watch?v=NcKSFlYEqYY&index=81&list=PL3D7BFF1DDBDAAFE5
/watch?v=N2Tx8S2V8ek&index=82&list=PL3D7BFF1DDBDAAFE5
/watch?v=Iy3wCppq2Yc&index=83&list=PL3D7BFF1DDBDAAFE5
/watch?v=lkadcYQ6SuY&index=84&list=PL3D7BFF1DDBDAAFE5
/watch?v=PQ94MmEg0Qw&index=85&list=PL3D7BFF1DDBDAAFE5
/watch?v=DqNzTaf9g5w&index=86&list=PL3D7BFF1DDBDAAFE5
/watch?v=BWGW8UsO4Hc&index=87&list=PL3D7BFF1DDBDAAFE5
/watch?v=b4MYh6N4z6s&index=88&list=PL3D7BFF1DDBDAAFE5
/watch?v=9xGIlaezMAU&index=89&list=PL3D7BFF1DDBDAAFE5
/watch?v=UC2wAuxECw0&index=90&list=PL3D7BFF1DDBDAAFE5
/watch?v=zRqcoUSbMI0&index=91&list=PL3D7BFF1DDBDAAFE5
/watch?v=D2iMtK8ETGs&index=92&list=PL3D7BFF1DDBDAAFE5
/watch?v=PJL8UChOsSk&index=93&list=PL3D7BFF1DDBDAAFE5
/watch?v=QbD6qwxiEUU&index=94&list=PL3D7BFF1DDBDAAFE5
/watch?v=-ZbdfYleuJU&index=95&list=PL3D7BFF1DDBDAAFE5
/watch?v=JVaGZwuYmck&index=96&list=PL3D7BFF1DDBDAAFE5
/watch?v=5pr7jwYF0JU&index=97&list=PL3D7BFF1DDBDAAFE5
/watch?v=MNCAmgFHcOI&index=98&list=PL3D7BFF1DDBDAAFE5
/watch?v=tXR0AlhNYxQ&index=99&list=PL3D7BFF1DDBDAAFE5
/watch?v=GtWXOzsD5Fw&index=100&list=PL3D7BFF1DDBDAAFE5
And the following output on cmd:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
/watch?v=B5uJeno3xg8&list=PL3D7BFF1DDBDAAFE5&index=13
/watch?v=hy0mRoT1ZlM&list=PL3D7BFF1DDBDAAFE5&index=14
/watch?v=Xpkbu2GrJpE&list=PL3D7BFF1DDBDAAFE5&index=15
/watch?v=-G91Hp3t6sg&list=PL3D7BFF1DDBDAAFE5&index=16
/watch?v=-zGS_zrL0rY&list=PL3D7BFF1DDBDAAFE5&index=17
/watch?v=4LHIESO0NGk&list=PL3D7BFF1DDBDAAFE5&index=18
/watch?v=8kybpxIixRk&list=PL3D7BFF1DDBDAAFE5&index=19
/watch?v=eHh2Yib7u-A&list=PL3D7BFF1DDBDAAFE5&index=20
/watch?v=zjHYyAJQ7Vw&list=PL3D7BFF1DDBDAAFE5&index=21
/watch?v=ma8aUC-Mf5M&list=PL3D7BFF1DDBDAAFE5&index=22
/watch?v=4MnuiIKCqsQ&list=PL3D7BFF1DDBDAAFE5&index=23
/watch?v=gz6P2E9lkfo&list=PL3D7BFF1DDBDAAFE5&index=24
/watch?v=roulejuE6B8&list=PL3D7BFF1DDBDAAFE5&index=25
/watch?v=NyusGsXc6SQ&list=PL3D7BFF1DDBDAAFE5&index=26
/watch?v=_joTj5XTwuQ&list=PL3D7BFF1DDBDAAFE5&index=27
/watch?v=55G47PgDwkY&list=PL3D7BFF1DDBDAAFE5&index=28
/watch?v=0MkeTcH0SPc&list=PL3D7BFF1DDBDAAFE5&index=29
/watch?v=QjQg8NkHGbw&list=PL3D7BFF1DDBDAAFE5&index=30
/watch?v=2CuTy8SA5kU&list=PL3D7BFF1DDBDAAFE5&index=31
/watch?v=MC2WFgZIZjo&list=PL3D7BFF1DDBDAAFE5&index=32
/watch?v=G_MkSpfKIPA&list=PL3D7BFF1DDBDAAFE5&index=33
/watch?v=Krt3g9HhhZ4&list=PL3D7BFF1DDBDAAFE5&index=34
/watch?v=lIwTbp5N7Hw&list=PL3D7BFF1DDBDAAFE5&index=35
/watch?v=geB8FqcUjo8&list=PL3D7BFF1DDBDAAFE5&index=36
/watch?v=Sqk154QSe8Y&list=PL3D7BFF1DDBDAAFE5&index=37
/watch?v=nq3yUjZGj5c&list=PL3D7BFF1DDBDAAFE5&index=38
/watch?v=8yA0vkjREyI&list=PL3D7BFF1DDBDAAFE5&index=39
/watch?v=AlC_Z5w8nDE&list=PL3D7BFF1DDBDAAFE5&index=40
/watch?v=2jduTfdt8RY&list=PL3D7BFF1DDBDAAFE5&index=41
/watch?v=6AoBM110DAY&list=PL3D7BFF1DDBDAAFE5&index=42
/watch?v=n6xhAVcopYU&list=PL3D7BFF1DDBDAAFE5&index=43
/watch?v=P2tNi1tS0xU&list=PL3D7BFF1DDBDAAFE5&index=44
/watch?v=AEA1qJFpheY&list=PL3D7BFF1DDBDAAFE5&index=45
/watch?v=iA2Efmo2PCA&list=PL3D7BFF1DDBDAAFE5&index=46
/watch?v=0-NTc0ezXes&list=PL3D7BFF1DDBDAAFE5&index=47
/watch?v=jbUQyJdf2P8&list=PL3D7BFF1DDBDAAFE5&index=48
/watch?v=zJ9qzvOOjAM&list=PL3D7BFF1DDBDAAFE5&index=49
/watch?v=wRa5Q2Eloa4&list=PL3D7BFF1DDBDAAFE5&index=50
/watch?v=Df129IGl31I&list=PL3D7BFF1DDBDAAFE5&index=51
/watch?v=SGx03Uqn9JA&list=PL3D7BFF1DDBDAAFE5&index=52
/watch?v=oaNus5QigYA&list=PL3D7BFF1DDBDAAFE5&index=53
/watch?v=fV3cpnNPWo0&list=PL3D7BFF1DDBDAAFE5&index=54
/watch?v=zXXCFmfJMNw&list=PL3D7BFF1DDBDAAFE5&index=55
/watch?v=iFoaqeEtTNU&list=PL3D7BFF1DDBDAAFE5&index=56
/watch?v=UlOM-CUlsBc&list=PL3D7BFF1DDBDAAFE5&index=57
/watch?v=XzTSdfLJt04&list=PL3D7BFF1DDBDAAFE5&index=58
/watch?v=iMe4fW31jMs&list=PL3D7BFF1DDBDAAFE5&index=59
/watch?v=BlKDYBqlfgs&list=PL3D7BFF1DDBDAAFE5&index=60
/watch?v=kOJGmVXuuFA&list=PL3D7BFF1DDBDAAFE5&index=61
/watch?v=wUmId0rwsBQ&list=PL3D7BFF1DDBDAAFE5&index=62
/watch?v=0wy907WZFiA&list=PL3D7BFF1DDBDAAFE5&index=63
/watch?v=ZMcYbf9Hhe4&list=PL3D7BFF1DDBDAAFE5&index=64
/watch?v=yowNavIDzzE&list=PL3D7BFF1DDBDAAFE5&index=65
/watch?v=cJUsL7sc1E8&list=PL3D7BFF1DDBDAAFE5&index=66
/watch?v=Od3xkrxcsE8&list=PL3D7BFF1DDBDAAFE5&index=67
/watch?v=iZMNaPgP4Ak&list=PL3D7BFF1DDBDAAFE5&index=68
/watch?v=PmOtvJqDfqY&list=PL3D7BFF1DDBDAAFE5&index=69
/watch?v=ulFq_0x29sI&list=PL3D7BFF1DDBDAAFE5&index=70
/watch?v=Dmq_WGhJbgI&list=PL3D7BFF1DDBDAAFE5&index=71
/watch?v=S36C23lW5qI&list=PL3D7BFF1DDBDAAFE5&index=72
/watch?v=3r9NGjBvv2w&list=PL3D7BFF1DDBDAAFE5&index=73
/watch?v=ioGWpu8Ud7A&list=PL3D7BFF1DDBDAAFE5&index=74
/watch?v=K7YuusyEvOg&list=PL3D7BFF1DDBDAAFE5&index=75
/watch?v=3OhGkg_XT3o&list=PL3D7BFF1DDBDAAFE5&index=76
/watch?v=G8QK452ynr4&list=PL3D7BFF1DDBDAAFE5&index=77
/watch?v=XVPCXNoiYIg&list=PL3D7BFF1DDBDAAFE5&index=78
/watch?v=m1AeMJux0Zo&list=PL3D7BFF1DDBDAAFE5&index=79
/watch?v=o5LrdSQrWEI&list=PL3D7BFF1DDBDAAFE5&index=80
/watch?v=NcKSFlYEqYY&list=PL3D7BFF1DDBDAAFE5&index=81
/watch?v=N2Tx8S2V8ek&list=PL3D7BFF1DDBDAAFE5&index=82
/watch?v=Iy3wCppq2Yc&list=PL3D7BFF1DDBDAAFE5&index=83
/watch?v=lkadcYQ6SuY&list=PL3D7BFF1DDBDAAFE5&index=84
/watch?v=PQ94MmEg0Qw&list=PL3D7BFF1DDBDAAFE5&index=85
/watch?v=DqNzTaf9g5w&list=PL3D7BFF1DDBDAAFE5&index=86
/watch?v=BWGW8UsO4Hc&list=PL3D7BFF1DDBDAAFE5&index=87
/watch?v=b4MYh6N4z6s&list=PL3D7BFF1DDBDAAFE5&index=88
/watch?v=9xGIlaezMAU&list=PL3D7BFF1DDBDAAFE5&index=89
/watch?v=UC2wAuxECw0&list=PL3D7BFF1DDBDAAFE5&index=90
/watch?v=zRqcoUSbMI0&list=PL3D7BFF1DDBDAAFE5&index=91
/watch?v=D2iMtK8ETGs&list=PL3D7BFF1DDBDAAFE5&index=92
/watch?v=PJL8UChOsSk&list=PL3D7BFF1DDBDAAFE5&index=93
/watch?v=QbD6qwxiEUU&list=PL3D7BFF1DDBDAAFE5&index=94
/watch?v=-ZbdfYleuJU&list=PL3D7BFF1DDBDAAFE5&index=95
/watch?v=JVaGZwuYmck&list=PL3D7BFF1DDBDAAFE5&index=96
/watch?v=5pr7jwYF0JU&list=PL3D7BFF1DDBDAAFE5&index=97
/watch?v=MNCAmgFHcOI&list=PL3D7BFF1DDBDAAFE5&index=98
/watch?v=tXR0AlhNYxQ&list=PL3D7BFF1DDBDAAFE5&index=99
/watch?v=GtWXOzsD5Fw&list=PL3D7BFF1DDBDAAFE5&index=100
I am using Python 2.7.12 and beautifulsoup4
Try this one:
import urllib2
from bs4 import BeautifulSoup

htmlParser = "lxml"
url = 'https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html = urllib2.urlopen(url)
response = html.read()
soup = BeautifulSoup(response, htmlParser)
links = soup.find_all('a', attrs={'class': 'pl-video-title-link'})
for a in links:
    print(a.get("href"))
Output:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
...
Try this:
from bs4 import BeautifulSoup
import requests

def getPlaylistLinks(url):
    sourceCode = requests.get(url).text
    soup = BeautifulSoup(sourceCode, 'html.parser')
    domain = 'https://www.youtube.com'
    for link in soup.find_all("a", {"dir": "ltr"}):
        href = link.get('href')
        if href.startswith('/watch?'):
            print(link.string.strip())
            print(domain + href + '\n')

getPlaylistLinks('Your URL')
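For the playlist in the question, the call would be:

getPlaylistLinks('https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5')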

Webcrawler not working with HTTPS

I am having an issue with my web crawler. It runs through any regular old website like a charm, but when it hits the https protocol it doesn't seem to work.
This is the error I get when I run an https URL through my crawler: NameError: name 'htmltext' is not defined.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse

#SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme + '://' + z.netloc
visited = [url]
robotsUrl = TopLevel + '/robots.txt'

while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except:
        print (urls[0])
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    for link in sourceCode.findAll('a', href=True):
        if "http://" not in link['href']:
            link['href'] = urllib.parse.urljoin(url, link['href'])
        in_disallow = False
        for i in range(len(disallowList)):
            if (disallowList[i]).upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])
print (visited)
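The NameError is a symptom of the bare except: when urlopen fails on an https URL (for example with an SSL certificate verification error), the exception is silently swallowed, the loop carries on, and htmltext is never assigned. A minimal sketch of the fix, keeping the same variable names as above, is to report the failure and skip that URL:

while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except Exception as e:
        print(urls[0], e)  # show why the fetch failed instead of hiding it
        urls.pop(0)        # drop the bad URL so the crawl can continue
        continue
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    # ... rest of the loop unchanged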

How to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for them, and I want to get the "src" of that iframe. I am using QtWebKit in Python via PySide. It actually works, but only once; it does not load any further web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    n = lower_url.find("comment")
    if n > 0:
        return True
    else:
        return False

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls += link
pprint(list_of_urls)
But the issue is that it works for only a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after all the JavaScript etc. has executed)?
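A common cause of this hang is constructing a fresh QApplication/event loop for every URL: once the first loop has quit, later exec_() calls never get going again. One way around it is to keep a single QWebPage and event loop alive and chain the loads from the loadFinished slot. A sketch of that idea (the BatchRender class and its method names are made up for illustration):

import sys
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl
from PySide.QtWebKit import QWebPage

class BatchRender(QWebPage):
    """Load a list of URLs one after another inside a single event loop."""
    def __init__(self, urls):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.urls = list(urls)
        self.results = {}  # url -> rendered HTML
        self.loadFinished.connect(self._on_load_finished)
        self._load_next()
        self.app.exec_()

    def _load_next(self):
        if self.urls:
            self.current = self.urls.pop(0)
            self.mainFrame().load(QUrl(self.current))
        else:
            self.app.quit()

    def _on_load_finished(self, ok):
        # toHtml() returns the DOM *after* JavaScript has run, which also
        # answers the second question about saving the rendered page
        self.results[self.current] = self.mainFrame().toHtml()
        self._load_next()

with open("urls.txt") as f:
    renderer = BatchRender(f.read().splitlines())
for url, html in renderer.results.items():
    print(url, len(html))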
