Cannot find the desired link for download (Python BeautifulSoup) - web-scraping

I am pretty new to Python's Beautiful Soup and I don't have much knowledge of HTML or JS. I tried to use bs4 to download all the .xls files on this page, but it seems that bs4 cannot find the links under the "attachment" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
"""
Get all links
"""
def getAllLinks(url):
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),"html.parser")
links = soup.find_all('a', href=True)
return links
def download(links):
for link in links:
#raw_input("Press Enter to continue...")
#print link
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))[1]
suffix = os.path.splitext(os.path.basename(link['href']))[1]
if os.path.splitext(os.path.basename(link['href']))[1] == '.xls':
print link #cannot find anything
currentLink = urllib2.urlopen(link)
links =
getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!

This seems to be one of the tasks for which BeautifulSoup (by itself, at least) is inadequate. You can, however, do it with Selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
>>> len(links)
30
>>> for link in links:
...     link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
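If the end goal is still to download only the .xls files, one rough follow-up (a sketch, assuming the Selenium session above is still open so links is usable, and using urllib2 to match the question's Python 2 code) is to filter the collected hrefs by extension and save each one locally:

import os
import urllib2

for link in links:
    href = link.get_attribute('href')
    # keep only the Excel workbooks
    if os.path.splitext(href)[1] == '.xls':
        # the saved name keeps the %20 URL escapes; rename afterwards if needed
        filename = os.path.basename(href)
        with open(filename, 'wb') as out:
            out.write(urllib2.urlopen(href).read())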

Beautiful Soup Scraping Content

Is there a way to get the number (13) at the very end?
I tried the following code:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = 'https://mgm.gov.tr/?il=Ankara'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_page = urlopen(req).read()
soup = BeautifulSoup(web_page, 'html.parser')
mydivs = soup.find_all("div", {"class": "tahminMax"})[0]
mydivs
and received the following output:
<div class="tahminMax"><span class="deger" ng-bind="gunlukTahmin[0].enYuksekGun1 | kaliteKontrol"></span><span class="derece">°C</span></div>
The values are retrieved dynamically from an XHR call, which you can find in the browser's network tab. You can extract them as follows:
import requests
headers = {'Origin': 'https://mgm.gov.tr'}
r = requests.get('https://servis.mgm.gov.tr/web/tahminler/saatlik?istno=17130', headers=headers).json()
d = {i['tarih']:i['maksimumRuzgarHizi'] for i in r[0]['tahmin']}
print(d)
The site's content is filled in by JavaScript after the page loads. Below is a way to achieve your goal using Selenium.
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
url = 'https://mgm.gov.tr/?il=Ankara'
sada = browser.get(url)
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
    for span in tag.findAll('span', attrs={'class': 'deger ng-binding'}):
        print(span.text)
browser.close()
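If the spans come back empty because the Angular binding has not finished by the time page_source is read, one hedged tweak is to wait for the value to appear before grabbing the source (this would go between browser.get(url) and source = browser.page_source):

from selenium.webdriver.support.ui import WebDriverWait

# wait up to 10 s until the binding has put some text into the span
WebDriverWait(browser, 10).until(
    lambda d: d.find_element_by_css_selector("div.tahminMax span.deger").text.strip() != ""
)
source = browser.page_source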
Here is BeautifulSoup alone attempting the task, but the value of 13 will not be present because it is only loaded by JavaScript.
from bs4 import BeautifulSoup
import requests
import time

r = requests.get('https://mgm.gov.tr/?il=Ankara')
time.sleep(3)
soup = BeautifulSoup(r.text, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
    for span in tag.findAll('span', attrs={'class': 'deger', 'ng-bind': True}):
        print(span.text)

Find_by_xpath results with errors

I'm Bart, I am new to Python, and this is my first post here.
As a whisky fan I wanted to scrape some shops for recent whisky deals, but I got stuck on Asda's page. I browsed here for ages without any luck, hence my post.
Thank you.
The browser opens and closes as expected.
Below is my creation:
# Import libraries
# import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
# import pandas as pd
# import requests
from selenium.webdriver.firefox.options import Options as FirefoxOptions
# specify url
#url = "https://groceries.asda.com/product/whisky/glenmorangie-the-original-single-malt-scotch-whisky/68303869"
url = "https://groceries.asda.com/search/whisky/1/relevance-desc/so-false/Type%3A3612046177%3AMalt%20Whisky"
# run webdriver with headless option
options = FirefoxOptions()
driver = webdriver.Firefox(options=options)
options.add_argument('--headless')
# get page
driver.get(url)
# execute script to scroll down the page
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;')
# sleep for 30s
time.sleep(30)
# close driver
driver.close()
# find element by xpath
results = driver.find_elements_by_xpath("//*[@id='componentsContainer']//*[@id='listingsContainer']//*[@class='product active']//*[@class='title productTitle']")
"""soup = BeautifulSoup(browser.page_source, 'html.parser')"""
print('Number of results', len(results))
Here is the output.
Traceback (most recent call last):
File "D:/PycharmProjects/Giraffe/asda.py", line 29, in <module>
results = driver.find_elements_by_xpath("//*[@id='componentsContainer']//*[@id='listingsContainer']//*[@class='product active']//*[@class='title productTitle']")
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 410, in find_elements_by_xpath
return self.find_elements(by=By.XPATH, value=xpath)
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 1007, in find_elements
'value': value})['value'] or []
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: Tried to run command without establishing a connection
Process finished with exit code 1
I tried to stick to the approach you have already written. Do not go for a hardcoded delay, as that is always inconsistent; opt for an explicit wait instead. That said, this is how you can get the result:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "https://groceries.asda.com/search/whisky"
driver = webdriver.Chrome()
wait = WebDriverWait(driver,10)
driver.get(url)
item = wait.until(EC.presence_of_element_located((By.XPATH, "//h3[@class='co-product-list__title']")))
driver.execute_script("arguments[0].scrollIntoView();", item)
results = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//li[contains(@class,'co-item')]//*[@class='co-product__title']/a")))
print('Number of results:', len(results))
driver.quit()
Output:
Number of results: 61
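As a side note on the original snippet: the --headless argument is added after webdriver.Firefox() has already been created (so it never takes effect), and driver.close() is called before find_elements_by_xpath, which is exactly why Selenium raises InvalidSessionIdException. If headless Firefox is still wanted, a minimal sketch of the right ordering would be:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions

options = FirefoxOptions()
options.add_argument('--headless')           # configure options before creating the driver
driver = webdriver.Firefox(options=options)

driver.get("https://groceries.asda.com/search/whisky")
# ... locate and read elements here, while the session is still alive ...
driver.quit()                                # only close the browser after scraping is done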

import start_urls from a csv file in Scrapy

I recently started web scraping using Scrapy. I generated a list of URLs that I want to scrape into a txt document, one URL per line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem
class RealtorSpider(scrapy.Spider):
    name = "realtor"
    allowed_domains = ["realtor.com"]
    with open('realtor2.txt') as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        #hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div/li/div/a/@href')
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            print(site.extract())
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing them; however, I get a ValueError, "Missing scheme in request URL":
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
%FF%FEw%00w%00w%00.%00r%00e%00a%00l%00t%00o%00r%00.%00c%00o%00m%00/%00r%00e%00a%00l%00e%00s%00t%00a%00t%00e%00a%00n%00d%00h%00o%00m%00e%00s%00-%00d%00e%00t%00a%00i%00l%00/%005%000%00-%00M%00e%00n%00o%00r%00e%00s%00-%00A%00v%00e%00-%00A%00p%00t%00-%006%001%000%00_%00C%00o%00r%00a%00l%00-%00G%00a%00b%00l%00e%00s%00_%00F%00L%00_%003%003%001%003%004%00_%00M%005%003%008%000%006%00-%005%008%006%007%007%00%0D%00
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue with how start_urls is defined, but I don't know how to proceed.
"ValueError: Missing scheme in request url" means that you are missing http.
You can use urljoin to avoid this problem.
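For context, the %FF%FE prefix and the interleaved %00 bytes in the error output suggest that realtor2.txt was saved as UTF-16, so there are really two problems: the file encoding and the missing scheme. A rough sketch of reading it (the UTF-16 encoding is an assumption based on that error) might look like:

import io

with io.open('realtor2.txt', encoding='utf-16') as f:
    start_urls = []
    for line in f:
        url = line.strip()
        if not url:
            continue
        # prepend a scheme when the line is just a bare host/path
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        start_urls.append(url)

urljoin (or response.urljoin(href) inside parse) is the usual tool for turning relative links found during parsing into absolute ones.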

Scraping YouTube playlist video links

I wanted to download all videos of this YouTube channel, so I tried to write a script with BeautifulSoup to scrape all the video links.
I did some inspection and found that the tr elements with class="pl-video yt-uix-tile" can be used to get the links. This is the Python code:
import urllib2
from bs4 import BeautifulSoup
url='https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html=urllib2.urlopen(url)
response=html.read()
soup=BeautifulSoup(response)
res=soup.find_all('tr',class_="pl-video yt-uix-tile ")
print res
But I am not able to get all the links. The output is empty. What can be done to resolve this?
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5')
page = r.text
soup=bs(page,'html.parser')
res=soup.find_all('a',{'class':'pl-video-title-link'})
for l in res:
    print l.get("href")
It gives me the output below in PyCharm:
/watch?v=SUOWNXGRc6g&index=1&list=PL3D7BFF1DDBDAAFE5
/watch?v=857zrsYZKGo&index=2&list=PL3D7BFF1DDBDAAFE5
/watch?v=Da1jlmwuW_w&index=3&list=PL3D7BFF1DDBDAAFE5
/watch?v=MIKl8PX838E&index=4&list=PL3D7BFF1DDBDAAFE5
/watch?v=sPFUTJgvVpQ&index=5&list=PL3D7BFF1DDBDAAFE5
/watch?v=maYFI5O6P-8&index=6&list=PL3D7BFF1DDBDAAFE5
/watch?v=6moe-rLZKCk&index=7&list=PL3D7BFF1DDBDAAFE5
/watch?v=eKXnQ83RU3I&index=8&list=PL3D7BFF1DDBDAAFE5
/watch?v=WjE-pWYElsE&index=9&list=PL3D7BFF1DDBDAAFE5
/watch?v=hUA_isgpTHI&index=10&list=PL3D7BFF1DDBDAAFE5
/watch?v=IHg_0HJ5iQo&index=11&list=PL3D7BFF1DDBDAAFE5
/watch?v=H92G3CpSQf4&index=12&list=PL3D7BFF1DDBDAAFE5
/watch?v=B5uJeno3xg8&index=13&list=PL3D7BFF1DDBDAAFE5
/watch?v=hy0mRoT1ZlM&index=14&list=PL3D7BFF1DDBDAAFE5
/watch?v=Xpkbu2GrJpE&index=15&list=PL3D7BFF1DDBDAAFE5
/watch?v=-G91Hp3t6sg&index=16&list=PL3D7BFF1DDBDAAFE5
/watch?v=-zGS_zrL0rY&index=17&list=PL3D7BFF1DDBDAAFE5
/watch?v=4LHIESO0NGk&index=18&list=PL3D7BFF1DDBDAAFE5
/watch?v=8kybpxIixRk&index=19&list=PL3D7BFF1DDBDAAFE5
/watch?v=eHh2Yib7u-A&index=20&list=PL3D7BFF1DDBDAAFE5
/watch?v=zjHYyAJQ7Vw&index=21&list=PL3D7BFF1DDBDAAFE5
/watch?v=ma8aUC-Mf5M&index=22&list=PL3D7BFF1DDBDAAFE5
/watch?v=4MnuiIKCqsQ&index=23&list=PL3D7BFF1DDBDAAFE5
/watch?v=gz6P2E9lkfo&index=24&list=PL3D7BFF1DDBDAAFE5
/watch?v=roulejuE6B8&index=25&list=PL3D7BFF1DDBDAAFE5
/watch?v=NyusGsXc6SQ&index=26&list=PL3D7BFF1DDBDAAFE5
/watch?v=_joTj5XTwuQ&index=27&list=PL3D7BFF1DDBDAAFE5
/watch?v=55G47PgDwkY&index=28&list=PL3D7BFF1DDBDAAFE5
/watch?v=0MkeTcH0SPc&index=29&list=PL3D7BFF1DDBDAAFE5
/watch?v=QjQg8NkHGbw&index=30&list=PL3D7BFF1DDBDAAFE5
/watch?v=2CuTy8SA5kU&index=31&list=PL3D7BFF1DDBDAAFE5
/watch?v=MC2WFgZIZjo&index=32&list=PL3D7BFF1DDBDAAFE5
/watch?v=G_MkSpfKIPA&index=33&list=PL3D7BFF1DDBDAAFE5
/watch?v=Krt3g9HhhZ4&index=34&list=PL3D7BFF1DDBDAAFE5
/watch?v=lIwTbp5N7Hw&index=35&list=PL3D7BFF1DDBDAAFE5
/watch?v=geB8FqcUjo8&index=36&list=PL3D7BFF1DDBDAAFE5
/watch?v=Sqk154QSe8Y&index=37&list=PL3D7BFF1DDBDAAFE5
/watch?v=nq3yUjZGj5c&index=38&list=PL3D7BFF1DDBDAAFE5
/watch?v=8yA0vkjREyI&index=39&list=PL3D7BFF1DDBDAAFE5
/watch?v=AlC_Z5w8nDE&index=40&list=PL3D7BFF1DDBDAAFE5
/watch?v=2jduTfdt8RY&index=41&list=PL3D7BFF1DDBDAAFE5
/watch?v=6AoBM110DAY&index=42&list=PL3D7BFF1DDBDAAFE5
/watch?v=n6xhAVcopYU&index=43&list=PL3D7BFF1DDBDAAFE5
/watch?v=P2tNi1tS0xU&index=44&list=PL3D7BFF1DDBDAAFE5
/watch?v=AEA1qJFpheY&index=45&list=PL3D7BFF1DDBDAAFE5
/watch?v=iA2Efmo2PCA&index=46&list=PL3D7BFF1DDBDAAFE5
/watch?v=0-NTc0ezXes&index=47&list=PL3D7BFF1DDBDAAFE5
/watch?v=jbUQyJdf2P8&index=48&list=PL3D7BFF1DDBDAAFE5
/watch?v=zJ9qzvOOjAM&index=49&list=PL3D7BFF1DDBDAAFE5
/watch?v=wRa5Q2Eloa4&index=50&list=PL3D7BFF1DDBDAAFE5
/watch?v=Df129IGl31I&index=51&list=PL3D7BFF1DDBDAAFE5
/watch?v=SGx03Uqn9JA&index=52&list=PL3D7BFF1DDBDAAFE5
/watch?v=oaNus5QigYA&index=53&list=PL3D7BFF1DDBDAAFE5
/watch?v=fV3cpnNPWo0&index=54&list=PL3D7BFF1DDBDAAFE5
/watch?v=zXXCFmfJMNw&index=55&list=PL3D7BFF1DDBDAAFE5
/watch?v=iFoaqeEtTNU&index=56&list=PL3D7BFF1DDBDAAFE5
/watch?v=UlOM-CUlsBc&index=57&list=PL3D7BFF1DDBDAAFE5
/watch?v=XzTSdfLJt04&index=58&list=PL3D7BFF1DDBDAAFE5
/watch?v=iMe4fW31jMs&index=59&list=PL3D7BFF1DDBDAAFE5
/watch?v=BlKDYBqlfgs&index=60&list=PL3D7BFF1DDBDAAFE5
/watch?v=kOJGmVXuuFA&index=61&list=PL3D7BFF1DDBDAAFE5
/watch?v=wUmId0rwsBQ&index=62&list=PL3D7BFF1DDBDAAFE5
/watch?v=0wy907WZFiA&index=63&list=PL3D7BFF1DDBDAAFE5
/watch?v=ZMcYbf9Hhe4&index=64&list=PL3D7BFF1DDBDAAFE5
/watch?v=yowNavIDzzE&index=65&list=PL3D7BFF1DDBDAAFE5
/watch?v=cJUsL7sc1E8&index=66&list=PL3D7BFF1DDBDAAFE5
/watch?v=Od3xkrxcsE8&index=67&list=PL3D7BFF1DDBDAAFE5
/watch?v=iZMNaPgP4Ak&index=68&list=PL3D7BFF1DDBDAAFE5
/watch?v=PmOtvJqDfqY&index=69&list=PL3D7BFF1DDBDAAFE5
/watch?v=ulFq_0x29sI&index=70&list=PL3D7BFF1DDBDAAFE5
/watch?v=Dmq_WGhJbgI&index=71&list=PL3D7BFF1DDBDAAFE5
/watch?v=S36C23lW5qI&index=72&list=PL3D7BFF1DDBDAAFE5
/watch?v=3r9NGjBvv2w&index=73&list=PL3D7BFF1DDBDAAFE5
/watch?v=ioGWpu8Ud7A&index=74&list=PL3D7BFF1DDBDAAFE5
/watch?v=K7YuusyEvOg&index=75&list=PL3D7BFF1DDBDAAFE5
/watch?v=3OhGkg_XT3o&index=76&list=PL3D7BFF1DDBDAAFE5
/watch?v=G8QK452ynr4&index=77&list=PL3D7BFF1DDBDAAFE5
/watch?v=XVPCXNoiYIg&index=78&list=PL3D7BFF1DDBDAAFE5
/watch?v=m1AeMJux0Zo&index=79&list=PL3D7BFF1DDBDAAFE5
/watch?v=o5LrdSQrWEI&index=80&list=PL3D7BFF1DDBDAAFE5
/watch?v=NcKSFlYEqYY&index=81&list=PL3D7BFF1DDBDAAFE5
/watch?v=N2Tx8S2V8ek&index=82&list=PL3D7BFF1DDBDAAFE5
/watch?v=Iy3wCppq2Yc&index=83&list=PL3D7BFF1DDBDAAFE5
/watch?v=lkadcYQ6SuY&index=84&list=PL3D7BFF1DDBDAAFE5
/watch?v=PQ94MmEg0Qw&index=85&list=PL3D7BFF1DDBDAAFE5
/watch?v=DqNzTaf9g5w&index=86&list=PL3D7BFF1DDBDAAFE5
/watch?v=BWGW8UsO4Hc&index=87&list=PL3D7BFF1DDBDAAFE5
/watch?v=b4MYh6N4z6s&index=88&list=PL3D7BFF1DDBDAAFE5
/watch?v=9xGIlaezMAU&index=89&list=PL3D7BFF1DDBDAAFE5
/watch?v=UC2wAuxECw0&index=90&list=PL3D7BFF1DDBDAAFE5
/watch?v=zRqcoUSbMI0&index=91&list=PL3D7BFF1DDBDAAFE5
/watch?v=D2iMtK8ETGs&index=92&list=PL3D7BFF1DDBDAAFE5
/watch?v=PJL8UChOsSk&index=93&list=PL3D7BFF1DDBDAAFE5
/watch?v=QbD6qwxiEUU&index=94&list=PL3D7BFF1DDBDAAFE5
/watch?v=-ZbdfYleuJU&index=95&list=PL3D7BFF1DDBDAAFE5
/watch?v=JVaGZwuYmck&index=96&list=PL3D7BFF1DDBDAAFE5
/watch?v=5pr7jwYF0JU&index=97&list=PL3D7BFF1DDBDAAFE5
/watch?v=MNCAmgFHcOI&index=98&list=PL3D7BFF1DDBDAAFE5
/watch?v=tXR0AlhNYxQ&index=99&list=PL3D7BFF1DDBDAAFE5
/watch?v=GtWXOzsD5Fw&index=100&list=PL3D7BFF1DDBDAAFE5
And the following output on cmd:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
/watch?v=B5uJeno3xg8&list=PL3D7BFF1DDBDAAFE5&index=13
/watch?v=hy0mRoT1ZlM&list=PL3D7BFF1DDBDAAFE5&index=14
/watch?v=Xpkbu2GrJpE&list=PL3D7BFF1DDBDAAFE5&index=15
/watch?v=-G91Hp3t6sg&list=PL3D7BFF1DDBDAAFE5&index=16
/watch?v=-zGS_zrL0rY&list=PL3D7BFF1DDBDAAFE5&index=17
/watch?v=4LHIESO0NGk&list=PL3D7BFF1DDBDAAFE5&index=18
/watch?v=8kybpxIixRk&list=PL3D7BFF1DDBDAAFE5&index=19
/watch?v=eHh2Yib7u-A&list=PL3D7BFF1DDBDAAFE5&index=20
/watch?v=zjHYyAJQ7Vw&list=PL3D7BFF1DDBDAAFE5&index=21
/watch?v=ma8aUC-Mf5M&list=PL3D7BFF1DDBDAAFE5&index=22
/watch?v=4MnuiIKCqsQ&list=PL3D7BFF1DDBDAAFE5&index=23
/watch?v=gz6P2E9lkfo&list=PL3D7BFF1DDBDAAFE5&index=24
/watch?v=roulejuE6B8&list=PL3D7BFF1DDBDAAFE5&index=25
/watch?v=NyusGsXc6SQ&list=PL3D7BFF1DDBDAAFE5&index=26
/watch?v=_joTj5XTwuQ&list=PL3D7BFF1DDBDAAFE5&index=27
/watch?v=55G47PgDwkY&list=PL3D7BFF1DDBDAAFE5&index=28
/watch?v=0MkeTcH0SPc&list=PL3D7BFF1DDBDAAFE5&index=29
/watch?v=QjQg8NkHGbw&list=PL3D7BFF1DDBDAAFE5&index=30
/watch?v=2CuTy8SA5kU&list=PL3D7BFF1DDBDAAFE5&index=31
/watch?v=MC2WFgZIZjo&list=PL3D7BFF1DDBDAAFE5&index=32
/watch?v=G_MkSpfKIPA&list=PL3D7BFF1DDBDAAFE5&index=33
/watch?v=Krt3g9HhhZ4&list=PL3D7BFF1DDBDAAFE5&index=34
/watch?v=lIwTbp5N7Hw&list=PL3D7BFF1DDBDAAFE5&index=35
/watch?v=geB8FqcUjo8&list=PL3D7BFF1DDBDAAFE5&index=36
/watch?v=Sqk154QSe8Y&list=PL3D7BFF1DDBDAAFE5&index=37
/watch?v=nq3yUjZGj5c&list=PL3D7BFF1DDBDAAFE5&index=38
/watch?v=8yA0vkjREyI&list=PL3D7BFF1DDBDAAFE5&index=39
/watch?v=AlC_Z5w8nDE&list=PL3D7BFF1DDBDAAFE5&index=40
/watch?v=2jduTfdt8RY&list=PL3D7BFF1DDBDAAFE5&index=41
/watch?v=6AoBM110DAY&list=PL3D7BFF1DDBDAAFE5&index=42
/watch?v=n6xhAVcopYU&list=PL3D7BFF1DDBDAAFE5&index=43
/watch?v=P2tNi1tS0xU&list=PL3D7BFF1DDBDAAFE5&index=44
/watch?v=AEA1qJFpheY&list=PL3D7BFF1DDBDAAFE5&index=45
/watch?v=iA2Efmo2PCA&list=PL3D7BFF1DDBDAAFE5&index=46
/watch?v=0-NTc0ezXes&list=PL3D7BFF1DDBDAAFE5&index=47
/watch?v=jbUQyJdf2P8&list=PL3D7BFF1DDBDAAFE5&index=48
/watch?v=zJ9qzvOOjAM&list=PL3D7BFF1DDBDAAFE5&index=49
/watch?v=wRa5Q2Eloa4&list=PL3D7BFF1DDBDAAFE5&index=50
/watch?v=Df129IGl31I&list=PL3D7BFF1DDBDAAFE5&index=51
/watch?v=SGx03Uqn9JA&list=PL3D7BFF1DDBDAAFE5&index=52
/watch?v=oaNus5QigYA&list=PL3D7BFF1DDBDAAFE5&index=53
/watch?v=fV3cpnNPWo0&list=PL3D7BFF1DDBDAAFE5&index=54
/watch?v=zXXCFmfJMNw&list=PL3D7BFF1DDBDAAFE5&index=55
/watch?v=iFoaqeEtTNU&list=PL3D7BFF1DDBDAAFE5&index=56
/watch?v=UlOM-CUlsBc&list=PL3D7BFF1DDBDAAFE5&index=57
/watch?v=XzTSdfLJt04&list=PL3D7BFF1DDBDAAFE5&index=58
/watch?v=iMe4fW31jMs&list=PL3D7BFF1DDBDAAFE5&index=59
/watch?v=BlKDYBqlfgs&list=PL3D7BFF1DDBDAAFE5&index=60
/watch?v=kOJGmVXuuFA&list=PL3D7BFF1DDBDAAFE5&index=61
/watch?v=wUmId0rwsBQ&list=PL3D7BFF1DDBDAAFE5&index=62
/watch?v=0wy907WZFiA&list=PL3D7BFF1DDBDAAFE5&index=63
/watch?v=ZMcYbf9Hhe4&list=PL3D7BFF1DDBDAAFE5&index=64
/watch?v=yowNavIDzzE&list=PL3D7BFF1DDBDAAFE5&index=65
/watch?v=cJUsL7sc1E8&list=PL3D7BFF1DDBDAAFE5&index=66
/watch?v=Od3xkrxcsE8&list=PL3D7BFF1DDBDAAFE5&index=67
/watch?v=iZMNaPgP4Ak&list=PL3D7BFF1DDBDAAFE5&index=68
/watch?v=PmOtvJqDfqY&list=PL3D7BFF1DDBDAAFE5&index=69
/watch?v=ulFq_0x29sI&list=PL3D7BFF1DDBDAAFE5&index=70
/watch?v=Dmq_WGhJbgI&list=PL3D7BFF1DDBDAAFE5&index=71
/watch?v=S36C23lW5qI&list=PL3D7BFF1DDBDAAFE5&index=72
/watch?v=3r9NGjBvv2w&list=PL3D7BFF1DDBDAAFE5&index=73
/watch?v=ioGWpu8Ud7A&list=PL3D7BFF1DDBDAAFE5&index=74
/watch?v=K7YuusyEvOg&list=PL3D7BFF1DDBDAAFE5&index=75
/watch?v=3OhGkg_XT3o&list=PL3D7BFF1DDBDAAFE5&index=76
/watch?v=G8QK452ynr4&list=PL3D7BFF1DDBDAAFE5&index=77
/watch?v=XVPCXNoiYIg&list=PL3D7BFF1DDBDAAFE5&index=78
/watch?v=m1AeMJux0Zo&list=PL3D7BFF1DDBDAAFE5&index=79
/watch?v=o5LrdSQrWEI&list=PL3D7BFF1DDBDAAFE5&index=80
/watch?v=NcKSFlYEqYY&list=PL3D7BFF1DDBDAAFE5&index=81
/watch?v=N2Tx8S2V8ek&list=PL3D7BFF1DDBDAAFE5&index=82
/watch?v=Iy3wCppq2Yc&list=PL3D7BFF1DDBDAAFE5&index=83
/watch?v=lkadcYQ6SuY&list=PL3D7BFF1DDBDAAFE5&index=84
/watch?v=PQ94MmEg0Qw&list=PL3D7BFF1DDBDAAFE5&index=85
/watch?v=DqNzTaf9g5w&list=PL3D7BFF1DDBDAAFE5&index=86
/watch?v=BWGW8UsO4Hc&list=PL3D7BFF1DDBDAAFE5&index=87
/watch?v=b4MYh6N4z6s&list=PL3D7BFF1DDBDAAFE5&index=88
/watch?v=9xGIlaezMAU&list=PL3D7BFF1DDBDAAFE5&index=89
/watch?v=UC2wAuxECw0&list=PL3D7BFF1DDBDAAFE5&index=90
/watch?v=zRqcoUSbMI0&list=PL3D7BFF1DDBDAAFE5&index=91
/watch?v=D2iMtK8ETGs&list=PL3D7BFF1DDBDAAFE5&index=92
/watch?v=PJL8UChOsSk&list=PL3D7BFF1DDBDAAFE5&index=93
/watch?v=QbD6qwxiEUU&list=PL3D7BFF1DDBDAAFE5&index=94
/watch?v=-ZbdfYleuJU&list=PL3D7BFF1DDBDAAFE5&index=95
/watch?v=JVaGZwuYmck&list=PL3D7BFF1DDBDAAFE5&index=96
/watch?v=5pr7jwYF0JU&list=PL3D7BFF1DDBDAAFE5&index=97
/watch?v=MNCAmgFHcOI&list=PL3D7BFF1DDBDAAFE5&index=98
/watch?v=tXR0AlhNYxQ&list=PL3D7BFF1DDBDAAFE5&index=99
/watch?v=GtWXOzsD5Fw&list=PL3D7BFF1DDBDAAFE5&index=100
I am using Python 2.7.12 and beautifulsoup4
Try this one:
import urllib2
from bs4 import BeautifulSoup
htmlParser = "lxml"
url='https://www.youtube.com/playlist?list=PL3D7BFF1DDBDAAFE5'
html=urllib2.urlopen(url)
response=html.read()
soup=BeautifulSoup(response, htmlParser)
links = soup.find_all('a', attrs={'class':'pl-video-title-link'})
for a in links:
    print(a.get("href"))
Output:
/watch?v=SUOWNXGRc6g&list=PL3D7BFF1DDBDAAFE5&index=1
/watch?v=857zrsYZKGo&list=PL3D7BFF1DDBDAAFE5&index=2
/watch?v=Da1jlmwuW_w&list=PL3D7BFF1DDBDAAFE5&index=3
/watch?v=MIKl8PX838E&list=PL3D7BFF1DDBDAAFE5&index=4
/watch?v=sPFUTJgvVpQ&list=PL3D7BFF1DDBDAAFE5&index=5
/watch?v=maYFI5O6P-8&list=PL3D7BFF1DDBDAAFE5&index=6
/watch?v=6moe-rLZKCk&list=PL3D7BFF1DDBDAAFE5&index=7
/watch?v=eKXnQ83RU3I&list=PL3D7BFF1DDBDAAFE5&index=8
/watch?v=WjE-pWYElsE&list=PL3D7BFF1DDBDAAFE5&index=9
/watch?v=hUA_isgpTHI&list=PL3D7BFF1DDBDAAFE5&index=10
/watch?v=IHg_0HJ5iQo&list=PL3D7BFF1DDBDAAFE5&index=11
/watch?v=H92G3CpSQf4&list=PL3D7BFF1DDBDAAFE5&index=12
...
Try this:
from bs4 import BeautifulSoup
import requests
def getPlaylistLinks(url):
    sourceCode = requests.get(url).text
    soup = BeautifulSoup(sourceCode, 'html.parser')
    domain = 'https://www.youtube.com'
    for link in soup.find_all("a", {"dir": "ltr"}):
        href = link.get('href')
        if href.startswith('/watch?'):
            print(link.string.strip())
            print(domain + href + '\n')

getPlaylistLinks('Your URL')

how to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for them, and I want to get the "src" of that iframe. I am using QtWebKit from PySide in Python. It actually works, but only once; it does not load the other web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup
class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    n = lower_url.find("comment")
    if n > 0:
        return True
    else:
        return False

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls += link
pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as it is displayed by the browser (after executing all the JavaScript, etc.)?
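For the second part of the question, the visit() helper above already returns the page's HTML after the JavaScript has run (via QWebFrame.toHtml()), so one rough option, if that rendered markup is what you mean by "as displayed", is simply to write it to disk (the URL below is a placeholder):

rendered = visit("http://example.com/some-article")
with open("rendered_page.html", "w") as out:
    out.write(rendered.encode("utf-8"))  # toHtml() returns a unicode string in Python 2

Note that this saves only the rendered HTML, not images or stylesheets.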
