I am pretty new to Python Beautiful Soup and I don't have much knowledge about html or js. I tried to use bs4 to download all xls files in this page, but it seems that bs4 cannot find the links under "attachment" section. Could someone help me out?
My current code is:
Scraping of all county-level raw data from the source page for all years. Data is stored in RawData.
Code modified from
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
Get all links
def getAllLinks(url):
    """Return every ``<a>`` tag that carries an ``href`` on the page at *url*.

    The page is parsed with the built-in ``html.parser``; only anchors present
    in the HTML returned by the server are found.
    """
    # NOTE(review): the first argument of the BeautifulSoup call was missing in
    # the text as found; fetching ``url`` with urllib2 is the presumed original
    # (``url`` was otherwise unused and urllib2 is imported) -- confirm.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    return soup.find_all('a', href=True)
def download(links):
    """Open every link in *links* whose ``href`` points at an ``.xls`` file.

    *links* is an iterable of objects supporting ``link['href']`` (for example
    the tags returned by ``getAllLinks``). Links with any other file suffix are
    skipped. Nothing is returned.
    """
    for link in links:
        href = link['href']
        suffix = os.path.splitext(os.path.basename(href))[1]
        if suffix != '.xls':
            continue
        print(link)
        # urlopen needs the URL string, not the tag object that holds it.
        # NOTE(review): the response is not consumed in the visible code.
        currentLink = urllib2.urlopen(href)
links =
(By the way, my desired link looks like this.)

This seems to be one of the tasks for which BeautifulSoup (in itself, at least) is inadequate. You can, however, do it with selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('')
>>> links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
>>> len(links)
>>> for link in links:
... link.get_attribute('href')


Beautiful Soup Scraping Content

Is there a way to get the number (13) at the very end?
I tried below code:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_page = urlopen(req).read()
soup = BeautifulSoup(web_page, 'html.parser')
mydivs = soup.find_all("div", {"class": "tahminMax"})[0]
and received following output:
<div class="tahminMax"><span class="deger" ng-bind="gunlukTahmin[0].enYuksekGun1 | kaliteKontrol"></span><span class="derece">°C</span></div>
Values are retrieved dynamically from another xhr call you can find in the network tab. You can extract them as follows:
import requests
headers = {'Origin': ''}
r = requests.get('', headers=headers).json()
d = {i['tarih']:i['maksimumRuzgarHizi'] for i in r[0]['tahmin']}
The page content is filled in by JavaScript that runs after the initial page load. You can achieve your goal with selenium, as shown below.
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
url = ''
sada = browser.get(url)
source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
for span in tag.findAll('span', attrs={'class': 'deger ng-binding'}):
Also, here is BeautifulSoup doing the same task, but the value 13 will not be in the output because it is only loaded dynamically.
from bs4 import BeautifulSoup
import requests
r = requests.get('')
soup = BeautifulSoup(r.text, 'html.parser')
for tag in soup.findAll("div", attrs={"class": "tahminMax"}):
for span in tag.findAll('span', attrs={'class': 'deger', 'ng-bind': True}):

Find_by_xpath results with errors

I'm Bart, I am new to Python, and this is my first post here.
As a fan of whisky I wanted to scrape some shops for recent whisky deals; however, I got stuck on Asda's page. I browsed here for ages without any luck, hence my post.
Thank you.
Browser is opening, and closing as expected.
below is my creation:
Import libraries
# import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
# import pandas as pd
# import requests
from selenium.webdriver.firefox.options import Options as FirefoxOptions
# specify url
#url = ""
url = ""
# run webdriver with headless option
options = FirefoxOptions()
driver = webdriver.Firefox(options=options)
# get page
# execute script to scroll down the page
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;')
# sleep for 30s
# close driver
# find element by xpath
results = driver.find_elements_by_xpath("//*[@id='componentsContainer']//*[@id='listingsContainer']//*[@class='product active']//*[@class='title productTitle']")
"""soup = BeautifulSoup(browser.page_source, 'html.parser')"""
print('Number of results', len(results))
Here is the output.
Traceback (most recent call last):
File "D:/PycharmProjects/Giraffe/", line 29, in <module>
results = driver.find_elements_by_xpath("//*[@id='componentsContainer']//*[@id='listingsContainer']//*[@class='product active']//*[@class='title productTitle']")
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\", line 410, in find_elements_by_xpath
return self.find_elements(by=By.XPATH, value=xpath)
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\", line 1007, in find_elements
'value': value})['value'] or []
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\", line 321, in execute
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: Tried to run command without establishing a connection
Process finished with exit code 1
I tried to stick to the way you have already written. Do not go for hardcoded delay as that is always inconsistent. Try to opt for Explicit Wait. That said this is how you can get the result:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = ""
driver = webdriver.Chrome()
wait = WebDriverWait(driver,10)
item = wait.until(EC.presence_of_element_located((By.XPATH, "//h3[@class='co-product-list__title']")))
results = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//li[contains(@class,'co-item')]//*[@class='co-product__title']/a")))
print('Number of results:', len(results))
Number of results: 61

import start_urls from a csv file in Scrapy

I recently started web scraping with Scrapy. I generated a list of URLs that I want to scrape and saved it in a txt document, one URL per line. This is my crawler code:
import scrapy
import csv
import sys
from realtor.items import RealtorItem
from scrapy.spider import BaseSpider
#from scrapy.selector import HtmlXPathSelector
#from realtor.items import RealtorItem
class RealtorSpider(scrapy.Spider):
    """Spider that collects listing-detail links from the pages in realtor2.txt."""

    name = "realtor"
    allowed_domains = [""]

    # realtor2.txt holds one URL per line. Blank lines are skipped: an empty
    # string would be turned into a request with no scheme, which fails with
    # "ValueError: Missing scheme in request url".
    with open('realtor2.txt') as f:
        start_urls = [line.strip() for line in f if line.strip()]

    def parse(self, response):
        """Return one RealtorItem per anchor that links to a listing-detail page.

        Each item's ``link`` field holds the list of ``href`` values extracted
        from the matching ``<a>`` element.
        """
        sites = response.xpath('//a[contains(@href, "/realestateandhomes-detail/")]')
        items = []
        for site in sites:
            item = RealtorItem()
            item['link'] = site.xpath('@href').extract()
            # Without this append the method always returned an empty list.
            items.append(item)
        return items
Now my goal is to read the links from realtor2.txt and start parsing through them; however, I get a ValueError: Missing scheme in request url:
File "C:\Users\Ash\Anaconda2\lib\site-packages\scrapy\http\request\", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url:
2017-06-25 22:28:35 [scrapy.core.engine] INFO: Closing spider (finished)
I think there may be an issue with how start_urls is defined, but I don't know how to proceed.
"ValueError: Missing scheme in request url" means that the URL has no scheme such as http:// or https://.
You can use urljoin to build absolute URLs and avoid this problem.

Scraping YouTube playlist video links

I wanted to download all videos of this Youtube channel. So I tried to write a script with BeautifulSoup to scrape all the links of the videos.
I did some inspection and found out that the rows matching tr class="pl-video yt-uix-tile" can be used to get the links. This is the Python code:
import urllib2
from bs4 import BeautifulSoup
res=soup.find_all('tr',class_="pl-video yt-uix-tile ")
print res
But I am not able to get all the links. The output is empty. What can be done to resolve this?
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('')
page = r.text
for l in res:
print l.get("href")
It is giving me output as below in pycharm:
And below output on cmd:
I am using Python 2.7.12 and beautifulsoup4
Try this one:
import urllib2
from bs4 import BeautifulSoup
htmlParser = "lxml"
soup=BeautifulSoup(response, htmlParser)
links = soup.find_all('a', attrs={'class':'pl-video-title-link'})
for a in links:
Try This-
from bs4 import BeautifulSoup
import requests
def getPlaylistLinks(url):
    """Print the link of every ``/watch?`` anchor found on the page at *url*.

    The page is fetched with ``requests`` and parsed with ``html.parser``.
    Only ``<a dir="ltr">`` elements are inspected; each matching href is
    printed prefixed with ``domain`` and followed by a blank line.
    """
    sourceCode = requests.get(url).text
    soup = BeautifulSoup(sourceCode, 'html.parser')
    domain = ''
    for link in soup.find_all("a", {"dir": "ltr"}):
        href = link.get('href')
        # Tag.get returns None for an anchor without an href attribute;
        # guard so that such anchors are skipped instead of raising.
        if href and href.startswith('/watch?'):
            print(domain + href + '\n')
getPlaylistLinks('Your URL')

how to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for it. I want to get the "src" of the iframe. I am using QtWebKit in Python using PySide. It is actually working, but just once. It is not loading other webpages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup
class Render(QWebPage):
    """Load *url* in a QWebPage and keep the loaded main frame in ``self.frame``."""

    def __init__(self, url):
        # NOTE(review): the assignment targets and several statements of this
        # method were missing in the text as found; ``self.app`` and the
        # load/exec lines below are the presumed original -- confirm.
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            # Taken when QApplication(...) raises; fall back to the instance
            # that already exists.
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Run the event loop until _loadFinished stops it.
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame and stop the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
def visit(url):
    """Render *url* with ``Render`` and return the main frame's HTML."""
    r = Render(url)
    # The frame URL used to be converted to ``str`` here and then discarded;
    # the conversion was unused, so it has been removed.
    return r.frame.toHtml()
def is_comment_url(url):
    """Return True if *url* contains the text "comment", ignoring case.

    A match at the very start of the string counts too (the previous
    ``find(...) > 0`` check rejected it, since ``find`` returns 0 there
    and -1 only when the text is absent).
    """
    return "comment" in url.lower()
with open("urls.txt") as f:
content =
# Collect the src of every iframe whose URL looks like a comment widget.
list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            # append keeps the URL as one entry; ``+=`` with a string would
            # add each of its characters as a separate list element.
            list_of_urls.append(link)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as it is displayed by the browser (after executing all the JavaScript, etc.)?
