Automate PDF downloads using Python - urllib3

I want to automatically download new PDF files as they are published on a collection of online libraries. I tried the following Python code, but it doesn't work with urllib3. Does anyone know how I can replicate it?
import urllib2

def main():
    download_file("http://mensenhandel.nl/files/pdftest2.pdf")

def download_file(download_url):
    response = urllib2.urlopen(download_url)
    file = open("document.pdf", 'wb')
    file.write(response.read())
    file.close()
    print("Completed")

if __name__ == "__main__":
    main()

Your approach is close; it only needs a few minor adjustments. Switch to the requests library and write the response's content attribute to the file. Below is the code:
import requests

def main():
    download_file("http://mensenhandel.nl/files/pdftest2.pdf")

def download_file(download_url):
    response = requests.get(download_url, stream=True)
    with open("document.pdf", 'wb') as file:
        # response.content reads the whole body as bytes;
        # the with block closes the file automatically
        file.write(response.content)
    print("Completed")

if __name__ == "__main__":
    main()
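If you want to stay with urllib3 itself, as the title suggests, a minimal sketch might look like this; it assumes the same URL and output filename as above:

import urllib3

def download_file(download_url):
    # PoolManager handles connection pooling and is reusable across requests
    http = urllib3.PoolManager()
    response = http.request("GET", download_url)
    with open("document.pdf", "wb") as f:
        f.write(response.data)  # response.data holds the full body as bytes
    print("Completed")

if __name__ == "__main__":
    download_file("http://mensenhandel.nl/files/pdftest2.pdf")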

Related

How do I resolve an async Tornado fetching future error

I am trying to use AsyncHTTPClient to GET/POST from a local service that is already running on port 6000,
but I keep getting the error RuntimeError: Task got bad yield: <tornado.concurrent.Future object at 0x03C9B490>
P.S. I'm using Tornado 4.4.2; this error is fixed in the latest version, but how do I do it in 4.4.2? Please help!
import tornado.ioloop
from tornado.httpclient import AsyncHTTPClient
import asyncio
import tornado
import urllib
from datetime import datetime
import time

async def client(url):
    http_client = AsyncHTTPClient()
    response = await http_client.fetch(url)
    return response.body

async def main():
    http_client = AsyncHTTPClient()
    url = "http://localhost:6000/listings"
    result = await client(url)
    print(result)

if __name__ == "__main__":
    result = asyncio.run(main())
    print(result)
    print(int(time.time() * 1e6))
You can't use asyncio with Tornado prior to version 5.0.
Use Tornado's own ioloop to run your program:
from tornado import ioloop

if __name__ == "__main__":
    result = ioloop.IOLoop.current().run_sync(main)
UPDATE: The above solution will work fine, but if you want, you can use asyncio with Tornado 4.x. See tornado.platform.asyncio.AsyncIOMainLoop.
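For completeness, a minimal sketch of that asyncio route on Tornado 4.x might look like this, reusing the main coroutine from the question; AsyncIOMainLoop bridges Tornado's IOLoop onto asyncio's event loop:

import asyncio
from tornado.platform.asyncio import AsyncIOMainLoop

if __name__ == "__main__":
    # Install the asyncio-backed IOLoop before any Tornado I/O objects are created
    AsyncIOMainLoop().install()
    # Drive the coroutine with asyncio's own event loop
    result = asyncio.get_event_loop().run_until_complete(main())
    print(result)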

Empty image in QClipboard [duplicate]

I'm trying to run a simple PyQt5 application on Linux, the code is as follows:
#!/usr/bin/python
import sys
from PyQt5.QtWidgets import QApplication, QWidget

def main():
    app = QApplication(sys.argv)
    w = QWidget()
    w.resize(250, 150)
    w.move(300, 300)
    w.setWindowTitle('Simple')
    w.show()
    mime = app.clipboard().mimeData()
    print(mime.hasImage())   # True
    print(mime.imageData())  # None
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()
Before running it, I copied an image into the clipboard, so mime.hasImage() should return True, and it does. But what's weird is that mime.imageData() sometimes returns None. That shouldn't happen: mime.imageData() should contain the image I copied instead of None. Is there anything wrong with the code?
By the way, this only seems to happen on Linux; mime.imageData() never returns None on Windows. I'm using Python 3.
That hasImage() returns True does not imply that imageData() will return a QImage; it only indicates that the user copied an image to the clipboard. In what format is that image exposed? It could be raw PNG or JPEG data, it could be a URL that the client application is expected to download, or it could be HTML from which the client application obtains the image by rendering it.
So, in general, the application the image was copied from decides the format it sends; there is no restrictive standard for that format, only common conventions.
The following example shows the logic to handle the images that come from urls and HTML:
#!/usr/bin/python
import sys
from functools import cached_property

from PyQt5.QtCore import pyqtSignal, QObject, QUrl
from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply
from PyQt5.QtGui import QGuiApplication, QImage, QPixmap
from PyQt5.QtWidgets import QApplication, QWidget, QLabel

from bs4 import BeautifulSoup


class ImageDownloader(QObject):
    finished = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.manager.finished.connect(self.handle_finished)

    @cached_property
    def manager(self):
        return QNetworkAccessManager()

    def start_download(self, url):
        self.manager.get(QNetworkRequest(url))

    def handle_finished(self, reply):
        if reply.error() != QNetworkReply.NoError:
            print("error: ", reply.errorString())
            return
        image = QImage()
        image.loadFromData(reply.readAll())
        self.finished.emit(image)


class ClipboardManager(QObject):
    imageChanged = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        QGuiApplication.clipboard().dataChanged.connect(
            self.handle_clipboard_datachanged
        )
        self.downloader.finished.connect(self.imageChanged)

    @cached_property
    def downloader(self):
        return ImageDownloader()

    def handle_clipboard_datachanged(self):
        mime = QGuiApplication.clipboard().mimeData()
        if mime.hasImage():
            image = mime.imageData()
            if image is not None:
                self.imageChanged.emit(image)
        elif mime.hasUrls():
            url = mime.urls()[0]
            self.downloader.start_download(url)
        elif mime.hasHtml():
            html = mime.html()
            soup = BeautifulSoup(html, features="lxml")
            imgs = soup.findAll("img")
            if imgs:
                url = QUrl.fromUserInput(imgs[0]["src"])
                self.downloader.start_download(url)
        else:
            for fmt in mime.formats():
                print(fmt, mime.data(fmt))


def main():
    app = QApplication(sys.argv)

    label = QLabel(scaledContents=True)
    label.resize(250, 150)
    label.move(300, 300)
    label.setWindowTitle("Simple")
    label.show()

    manager = ClipboardManager()
    manager.imageChanged.connect(
        lambda image: label.setPixmap(QPixmap.fromImage(image))
    )

    sys.exit(app.exec_())


if __name__ == "__main__":
    main()

Issue while adding a job in APScheduler (Python 3.6)

I am trying to schedule my Python script (merchant.py) to run once a week using APScheduler.
For this purpose, I have installed the package, but when I run the following example code with the cron trigger:
from datetime import datetime
import time
import os
from apscheduler.schedulers.background import BackgroundScheduler

def tick():
    print('Tick! The time is: %s' % datetime.now())

if __name__ == '__main__':
    scheduler = BackgroundScheduler()
    scheduler.add_job(tick, 'cron', day_of_week='mon', hour=11)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    try:
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
I get the following errors:
KeyError: 'cron'
LookupError: No trigger by the name "cron" was found
Could you please point out what I am doing wrong here?
I found one solution, so I am sharing it for others:
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.combining import OrTrigger
from apscheduler.triggers.cron import CronTrigger

sched = BlockingScheduler()
trigger = OrTrigger([
    CronTrigger(day_of_week='thu', hour='07', minute='28')
])
sched.add_job(YOUR_JOB, trigger)

try:
    # BlockingScheduler.start() blocks, so no extra keep-alive loop is needed
    sched.start()
except (KeyboardInterrupt, SystemExit):
    sched.shutdown()
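Applied to the original goal of running merchant.py once a week, a minimal sketch might look like the following; it assumes merchant.py has no importable entry point and is therefore launched as a subprocess with the same interpreter:

import subprocess
import sys

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger

def run_merchant():
    # Run merchant.py with the interpreter that runs this scheduler
    subprocess.run([sys.executable, "merchant.py"], check=True)

sched = BlockingScheduler()
# Fire once a week, every Monday at 11:00
sched.add_job(run_merchant, CronTrigger(day_of_week='mon', hour=11))

try:
    sched.start()  # blocks until the process is interrupted
except (KeyboardInterrupt, SystemExit):
    pass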

Cannot find the desired link for download (Python BeautifulSoup)

I am pretty new to Python's BeautifulSoup and I don't have much knowledge of HTML or JS. I tried to use bs4 to download all the .xls files on this page, but it seems that bs4 cannot find the links under the "Attachments" section. Could someone help me out?
My current code is:
"""
Scrapping of all county-level raw data from
http://www.countyhealthrankings.org for all years. Data stored in RawData
folder.
Code modified from https://null-byte.wonderhowto.com/how-to/download-all-
pdfs-webpage-with-python-script-0163031/
"""
from bs4 import BeautifulSoup
import urlparse
import urllib2
import os
import sys
"""
Get all links
"""
def getAllLinks(url):
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),"html.parser")
links = soup.find_all('a', href=True)
return links
def download(links):
for link in links:
#raw_input("Press Enter to continue...")
#print link
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))
#print "------------------------------------"
#print os.path.splitext(os.path.basename(link['href']))[1]
suffix = os.path.splitext(os.path.basename(link['href']))[1]
if os.path.splitext(os.path.basename(link['href']))[1] == '.xls':
print link #cannot find anything
currentLink = urllib2.urlopen(link)
links =
getAllLinks("http://www.countyhealthrankings.org/app/iowa/2017/downloads")
download(links)
(By the way, my desired link looks like this.)
Thanks!
This seems to be one of the tasks for which BeautifulSoup (in itself, at least) is inadequate. You can, however, do it with selenium.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get('http://www.countyhealthrankings.org/app/iowa/2017/downloads')
>>> links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
>>> len(links)
30
>>> for link in links:
... link.get_attribute('href')
...
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2017_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Iowa%20Data%20-%20v1.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2016_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2015_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Iowa%20Data%20-%20v3.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/CHR2014_IA_v2.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Iowa%20Data%20-%20v6.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2013_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Iowa%20Data%20-%20v1_0.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2012_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2011_IA.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Outcomes%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20Health%20Factors%20-%20Iowa.png'
'http://www.countyhealthrankings.org/sites/default/files/states/CHR2010_IA_0.pdf'
'http://www.countyhealthrankings.org/sites/default/files/state/downloads/2010%20County%20Health%20Ranking%20Iowa%20Data%20-%20v2.xls'
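To actually save the spreadsheets, a short follow-up sketch (assuming the hrefs collected above are directly downloadable without cookies or authentication) could filter for .xls and fetch each one with requests:

import os
import requests

hrefs = [link.get_attribute('href') for link in links]
for href in hrefs:
    if href.lower().endswith('.xls'):
        # Derive a local filename from the last path segment of the URL
        filename = os.path.basename(href)
        response = requests.get(href)
        with open(filename, 'wb') as f:
            f.write(response.content)
        print('Saved', filename)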

How to load multiple pages one by one in QWebPage

I am trying to crawl news article pages for comments. After some research I found that most websites use an iframe for them, and I want to get the "src" of that iframe. I am using QtWebKit in Python via PySide. It actually works, but only once; it does not load the other web pages. I am using the following code:
import sys
import pymysql
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from pprint import pprint
from bs4 import BeautifulSoup

class Render(QWebPage):
    def __init__(self, url):
        try:
            self.app = QApplication(sys.argv)
        except RuntimeError:
            self.app = QCoreApplication.instance()
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def visit(url):
    r = Render(url)
    p = r.frame.toHtml()
    f_url = str(r.frame.url().toString())
    return p

def is_comment_url(url):
    lower_url = url.lower()
    n = lower_url.find("comment")
    if n > 0:
        return True
    else:
        return False

with open("urls.txt") as f:
    content = f.read().splitlines()

list_of_urls = []
for url in content:
    page = visit(url)
    soup = BeautifulSoup(page)
    for tag in soup.findAll('iframe', src=True):
        link = tag['src']
        if is_comment_url(link):
            print(link)
            list_of_urls += link
pprint(list_of_urls)
But the issue is that it works only for a single iteration and then gets stuck.
Also, is there any way to save a web page exactly as the browser displays it (after executing all the JavaScript etc.)?
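One commonly suggested workaround is to keep a single QApplication alive and chain the page loads from the loadFinished handler instead of constructing a new Render (and a new event loop) per URL. A rough sketch under those assumptions, with a hypothetical class and handler name:

import sys
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl
from PySide.QtWebKit import QWebPage

class SequentialRender(QWebPage):
    """Load a list of URLs one after another inside a single event loop."""

    def __init__(self, urls, handler):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.urls = list(urls)
        self.handler = handler  # called with (url, html) for every finished page
        self.loadFinished.connect(self._load_finished)
        self._load_next()
        self.app.exec_()

    def _load_next(self):
        if self.urls:
            self.current_url = self.urls.pop(0)
            self.mainFrame().load(QUrl(self.current_url))
        else:
            self.app.quit()  # no pages left, leave the event loop

    def _load_finished(self, ok):
        # toHtml() returns the DOM after JavaScript has run, which also
        # covers saving the page roughly as the browser displays it
        self.handler(self.current_url, self.mainFrame().toHtml())
        self._load_next()

# Usage sketch:
# SequentialRender(open("urls.txt").read().splitlines(),
#                  lambda url, html: print(url, len(html)))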
