I am trying to crawl news article pages for comments. After some research I found that most websites load comments in an iframe, so I want to get the "src" of that iframe. I am using QtWebKit from Python via PySide. It actually works, but only once: it does not load any further web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup
class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p


def is_comment_url(url):
    lower_url = url.lower()
    n = lower_url.find("comment")
    if n > 0:
        return True
    else:
        return False


with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls += link

pprint(list_of_urls)
The issue is that it works for a single iteration only and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after all the JavaScript has executed)?
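A frequent cause of this kind of hang is that each Render call constructs a new QApplication and runs its own event loop, while Qt expects a single application object per process. A minimal sketch that reuses one QApplication and one Render object across URLs (keeping the PySide/QtWebKit setup above; the render_url method name and the URLs are illustrative additions, not the original code):

import sys
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl
from PySide.QtWebKit import QWebPage

class Render(QWebPage):
    """Loads pages one after another inside a single QApplication."""
    def __init__(self, app):
        QWebPage.__init__(self)
        self.app = app
        self.loadFinished.connect(self._load_finished)

    def render_url(self, url):
        # Start loading and block in the event loop until loadFinished fires.
        self.mainFrame().load(QUrl(url))
        self.app.exec_()
        # toHtml() returns the DOM after JavaScript has run, which also
        # covers the "save the page as the browser displays it" part.
        return self.mainFrame().toHtml()

    def _load_finished(self, ok):
        self.app.quit()

app = QApplication(sys.argv)
renderer = Render(app)
for url in ["http://example.com/article-1", "http://example.com/article-2"]:
    html = renderer.render_url(url)
    print(len(html))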
I'm trying to run a simple PyQt5 application on Linux, the code is as follows:
#!/usr/bin/python

import sys
from PyQt5.QtWidgets import QApplication, QWidget


def main():
    app = QApplication(sys.argv)

    w = QWidget()
    w.resize(250, 150)
    w.move(300, 300)
    w.setWindowTitle('Simple')
    w.show()

    mime = app.clipboard().mimeData()
    print(mime.hasImage())   # True
    print(mime.imageData())  # None

    sys.exit(app.exec_())


if __name__ == '__main__':
    main()
Before running it, I copied an image to the clipboard, so mime.hasImage() should return True, and it does. What's weird is that mime.imageData() sometimes returns None. That shouldn't happen: mime.imageData() should contain the image I copied. Is there anything wrong with the code?
By the way, this seems to happen only on Linux; mime.imageData() never returns None on Windows. I'm using Python 3.
That hasImage() returns True does not imply that imageData() returns a QImage; it only indicates that the user copied an image to the clipboard. In what format was the image copied? It could be PNG, JPG, etc., or the clipboard could instead provide a URL for the client application to download, or HTML from which the client application obtains the image by rendering it.
So, in general, the application from which the image was copied decides the format it sends; there is no restrictive standard for that format, only common formats.
The following example shows the logic for handling images that arrive as URLs or as HTML:
#!/usr/bin/python

import sys
from functools import cached_property

from PyQt5.QtCore import pyqtSignal, QObject, QUrl
from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply
from PyQt5.QtGui import QGuiApplication, QImage, QPixmap
from PyQt5.QtWidgets import QApplication, QWidget, QLabel

from bs4 import BeautifulSoup


class ImageDownloader(QObject):
    finished = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.manager.finished.connect(self.handle_finished)

    @cached_property
    def manager(self):
        return QNetworkAccessManager()

    def start_download(self, url):
        self.manager.get(QNetworkRequest(url))

    def handle_finished(self, reply):
        if reply.error() != QNetworkReply.NoError:
            print("error: ", reply.errorString())
            return
        image = QImage()
        image.loadFromData(reply.readAll())
        self.finished.emit(image)


class ClipboardManager(QObject):
    imageChanged = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        QGuiApplication.clipboard().dataChanged.connect(
            self.handle_clipboard_datachanged
        )
        self.downloader.finished.connect(self.imageChanged)

    @cached_property
    def downloader(self):
        return ImageDownloader()

    def handle_clipboard_datachanged(self):
        mime = QGuiApplication.clipboard().mimeData()
        if mime.hasImage():
            image = mime.imageData()
            if image is not None:
                self.imageChanged.emit(image)
        elif mime.hasUrls():
            url = mime.urls()[0]
            self.downloader.start_download(url)
        elif mime.hasHtml():
            html = mime.html()
            soup = BeautifulSoup(html, features="lxml")
            imgs = soup.findAll("img")
            if imgs:
                url = QUrl.fromUserInput(imgs[0]["src"])
                self.downloader.start_download(url)
        else:
            for fmt in mime.formats():
                print(fmt, mime.data(fmt))


def main():
    app = QApplication(sys.argv)

    label = QLabel(scaledContents=True)
    label.resize(250, 150)
    label.move(300, 300)
    label.setWindowTitle("Simple")
    label.show()

    manager = ClipboardManager()
    manager.imageChanged.connect(
        lambda image: label.setPixmap(QPixmap.fromImage(image))
    )

    sys.exit(app.exec_())


if __name__ == "__main__":
    main()
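With this running, copying an image, an image URL, or HTML containing an img tag to the clipboard should update the label; the final branch, which prints the raw clipboard formats, is only a fallback for content the example does not handle.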
I am pretty new to Python and Beautiful Soup, and I don't have much knowledge of HTML or JavaScript. I tried to use bs4 to download all of the .xls files on this page, but it seems that bs4 cannot find the links under the "attachment" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
"""
Get all links
"""
def getAllLinks(url):
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),"html.parser")
links = soup.find_all('a', href=True)
return links
def download(links):
for link in links:
#raw_input("Press Enter to continue...")
#print link
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))[1]
suffix = os.path.splitext(os.path.basename(link['href']))[1]
if os.path.splitext(os.path.basename(link['href']))[1] == '.xls':
print link #cannot find anything
currentLink = urllib2.urlopen(link)
links =
getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!
This seems to be one of those tasks for which BeautifulSoup (on its own, at least) is inadequate. You can, however, do it with Selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
>>> len(links)
30
>>> for link in links:
... link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
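If you then want to actually save the .xls attachments rather than just print their URLs, a small follow-up script can filter the hrefs collected above and download them. This is a sketch assuming Python 3 and that the files are directly downloadable without a session; the RawData output folder is just a placeholder:

import os
import urllib.parse
import urllib.request

os.makedirs("RawData", exist_ok=True)  # placeholder output folder

for link in links:
    href = link.get_attribute('href')
    if href.lower().endswith('.xls'):
        # Derive a local filename from the (percent-encoded) URL path.
        filename = os.path.join("RawData", urllib.parse.unquote(os.path.basename(href)))
        urllib.request.urlretrieve(href, filename)
        print("saved", filename)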
I am having an issue with my web crawler. It runs through any regular old website like a charm, but when it hits an https URL it doesn't seem to work.
This is the error I get when I run an https URL through my crawler: name 'htmltext' is not defined.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse

# SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme + '://' + z.netloc
visited = [url]
robotsUrl = TopLevel + '/robots.txt'

while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except:
        print(urls[0])
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    for link in sourceCode.findAll('a', href=True):
        if "http://" not in link['href']:
            link['href'] = urllib.parse.urljoin(url, link['href'])
        in_disallow = False
        for i in range(len(disallowList)):
            if (disallowList[i]).upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])

print(visited)
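The NameError itself comes from the bare except: when urlopen fails (an SSL error, a robots.txt fetch failure, and so on), the exception is swallowed, execution falls through, and htmltext is then used without ever having been assigned. A minimal sketch of the loop with that fixed, plus a startswith check so https links are treated as absolute (robots.txt handling is omitted for brevity; this is an illustration, not the asker's exact code):

import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

urls = ["https://en.wikipedia.org/wiki/Main_Page"]
visited = list(urls)

while urls and len(visited) < 100:
    current = urls.pop(0)
    try:
        htmltext = urllib.request.urlopen(current).read()
    except Exception as exc:
        # If the fetch fails, report it and move on; htmltext is never used unset.
        print("failed:", current, exc)
        continue

    soup = BeautifulSoup(htmltext, "html.parser")
    for link in soup.findAll('a', href=True):
        href = link['href']
        # startswith covers both http and https, unlike the '"http://" not in' test.
        if not href.startswith(('http://', 'https://')):
            href = urllib.parse.urljoin(current, href)
        if href not in visited:
            visited.append(href)
            urls.append(href)

print(len(visited))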
I am trying to download the content of the two links below. The first link works properly and its content is downloaded as an HTML file, but the content of the second link (a text file) is not downloaded. Please help me out.
Here is my code:
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
import sys
import codecs

class Downloader(QObject):
    # To be emitted when every item has been downloaded
    done = Signal()

    def __init__(self, urlList, parent=None):
        super(Downloader, self).__init__(parent)
        self.urlList = urlList
        self.counter = 0
        # As you probably don't need to display the page,
        # you can use QWebPage instead of QWebView
        self.page = QWebPage(self)
        self.page.loadFinished.connect(self.save)
        self.startNext()

    def currentUrl(self):
        return self.urlList[self.counter][0]

    def currentFilename(self):
        return self.urlList[self.counter][1]

    def startNext(self):
        print "Downloading %s..." % self.currentUrl()
        self.page.mainFrame().load(self.currentUrl())

    def save(self, ok):
        if ok:
            data = self.page.mainFrame().toHtml()
            with codecs.open(self.currentFilename(), encoding="utf-8", mode="w") as f:
                f.write(data)
            print "Saving %s to %s." % (self.currentUrl(), self.currentFilename())
        else:
            print "Error while downloading %s\nSkipping." % self.currentUrl()
        self.counter += 1
        if self.counter < len(self.urlList):
            self.startNext()
        else:
            self.done.emit()

urlList = [("http://www.nsf.gov/awardsearch/simpleSearchResult?queryText=8", "nsf.html"),
           ("http://www.nsf.gov/awardsearch/ExportResultServlet?exportType=txt", "a.txt")]

app = QApplication(sys.argv)
downloader = Downloader(urlList)
# Quit when done
downloader.done.connect(app.quit)

# To view the pages
web = QWebView()
# To prevent user actions that would interrupt the current page load
web.setDisabled(True)
web.setPage(downloader.page)
web.show()

sys.exit(app.exec_())
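For what it's worth, the second URL responds with a plain-text export rather than an HTML page, so QWebPage's loadFinished/toHtml path is not a good fit for it. One hedged alternative (a sketch, not a tested fix) is to fetch non-HTML URLs through the same QNetworkAccessManager the page uses, so any session cookies set while loading the first page are reused; fetch_plain_file below is an illustrative helper, not part of the original code:

from PySide.QtCore import QUrl
from PySide.QtNetwork import QNetworkRequest

def fetch_plain_file(page, url, filename):
    """Download url with the page's network manager and save the raw bytes."""
    manager = page.networkAccessManager()
    reply = manager.get(QNetworkRequest(QUrl(url)))

    def finished():
        with open(filename, "wb") as f:
            f.write(reply.readAll().data())  # QByteArray -> bytes
        reply.deleteLater()

    reply.finished.connect(finished)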
I am opening a page in a QWebView (in PyQt, if that matters) and I want all links to open in the system default browser. That is, clicking a link should not change the site shown in the QWebView but should open it in the default browser. I want to make it impossible for the user to change the site in the QWebView.
How can I do that?
Thanks,
Albert
That does it:
import sys, webbrowser
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("http://www.az2000.de/projects/javascript-project/"))
web.page().setLinkDelegationPolicy(QWebPage.DelegateAllLinks)

def linkClicked(url):
    webbrowser.open(str(url.toString()))

web.connect(web, SIGNAL("linkClicked(const QUrl&)"), linkClicked)
web.show()
sys.exit(app.exec_())
Updated example for PyQt5 / QtWebEngine (the key is to re-implement the acceptNavigationRequest method):
from PyQt5 import QtWidgets, QtCore, QtGui, QtWebEngineWidgets


class RestrictedQWebEnginePage(QtWebEngineWidgets.QWebEnginePage):
    """Filters links so that users cannot navigate to arbitrary pages on the web,
    only to those listed in allowed_pages.
    This is achieved by re-implementing acceptNavigationRequest.
    The latter could also be adapted to accept, e.g., any URL within a domain."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.allowed_pages = []

    def acceptNavigationRequest(self, qurl, navtype, mainframe):
        # print("Navigation request intercepted:", qurl)
        if qurl in self.allowed_pages:  # open in the QWebEngineView
            return True
        else:  # delegate the link to the default browser
            QtGui.QDesktopServices.openUrl(qurl)
            return False


class RestrictedWebViewWidget(QtWidgets.QWidget):
    """A QWebEngineView is required to display a QWebEnginePage."""

    def __init__(self, parent=None, url=None, html_file=None):
        super().__init__(parent)
        self.view = QtWebEngineWidgets.QWebEngineView()
        self.page = RestrictedQWebEnginePage()

        if html_file:
            print("Loading File:", html_file)
            self.url = QtCore.QUrl.fromLocalFile(html_file)
            self.page.allowed_pages.append(self.url)
            self.page.load(self.url)
        elif url:
            print("Loading URL:", url)
            self.url = QtCore.QUrl(url)
            self.page.allowed_pages.append(self.url)
            self.page.load(self.url)

        # associate the page with the view
        self.view.setPage(self.page)

        # set the layout
        self.vl = QtWidgets.QVBoxLayout()
        self.vl.addWidget(self.view)
        self.setLayout(self.vl)


if __name__ == '__main__':
    import sys

    app = QtWidgets.QApplication(sys.argv)
    web = RestrictedWebViewWidget(url="YOUR URL")  # or YOUR local HTML file
    web.show()
    sys.exit(app.exec_())
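As the docstring notes, the same acceptNavigationRequest override can also filter by domain instead of an explicit whitelist. A minimal sketch of that variation, reusing the imports from the example above (the host name is only an example):

class DomainRestrictedPage(QtWebEngineWidgets.QWebEnginePage):
    """Keeps navigation inside one domain; everything else goes to the default browser."""

    allowed_host = "example.org"  # example domain, adjust as needed

    def acceptNavigationRequest(self, qurl, navtype, mainframe):
        if qurl.host().endswith(self.allowed_host):
            return True  # stay inside the embedded view
        QtGui.QDesktopServices.openUrl(qurl)  # hand off to the system browser
        return False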
When you click a link that has the target="_blank" attribute, Qt calls the createWindow method of QWebEnginePage to create a new tab or window.
The key is to re-implement this method so that, instead of opening a new tab, it opens the link in the system default browser.
# Imports added for completeness: the snippet needs these classes in scope.
from PyQt5 import QtWidgets, QtWebEngineWidgets
from PyQt5.QtCore import QUrl
from PyQt5.QtGui import QDesktopServices
from PyQt5.QtWebEngineWidgets import QWebEngineView


class WebEnginePage(QtWebEngineWidgets.QWebEnginePage):
    def createWindow(self, _type):
        # A temporary page receives the target="_blank" navigation...
        page = WebEnginePage(self)
        page.urlChanged.connect(self.open_browser)
        return page

    def open_browser(self, url):
        # ...and as soon as its URL is known, it is handed to the system browser.
        page = self.sender()
        QDesktopServices.openUrl(url)
        page.deleteLater()


class MainWindow(QtWidgets.QMainWindow):
    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)
        self.url = QUrl("https://stackoverflow.com/")
        self.webView = QWebEngineView()
        self.page = WebEnginePage(self.webView)
        self.webView.setPage(self.page)
        self.webView.load(self.url)
        self.setCentralWidget(self.webView)  # show the view inside the main window


if __name__ == '__main__':
    import sys

    app = QtWidgets.QApplication(sys.argv)
    web = MainWindow()
    web.show()
    sys.exit(app.exec_())