Openpyxl, Date time -writing only first cell - datetime

I want to write the names of a set of video files with their corresponding file length to an excel sheet. The list of video populates in the first column. The length of the video does not populate in the excel sheet.
Video1.mp4 | 03:23
Video2.mp4 | 09:33
Video3.mp4 | 04:77
Video4.mp4 | 02:00
I want to populate the names of the video in column1
and length of the videos in columnB.
import os
from mutagen.mp4 import MP4
from datetime import timedelta
from openpyxl import Workbook
path = r'C:\Users\Me\videos'
wb = Workbook()
ws=wb.active
for files in os.listdir(path):
full = os.path.join(path, files)
audio = MP4(full)
formated = timedelta(seconds=audio.info.length)
ws.cell(row=1, column=2).value = timedelta(seconds=audio.info.length)
print(formated)
for r in range(0,len(os.listdir(path))):
ws.cell(row=r+1,column=1).value= os.listdir(path)[r]
wb.save('text.xlsx')

Related

How do I make it so that, these blank spaces are filled up with the next image, instead of opening up another page

How do I make it so that, these blank spaces are filled up with the next image, instead of opening up another page, is there some sort of if statement for docx, which goes like
this is what I mean, visually..
if (space)
-> insert image there
My code:
import docx
from PIL import Image
import os, os.path
from docx.enum.section import WD_ORIENT
from docx.enum.section import WD_SECTION
mydoc = docx.Document()
imgs = []
path1 = "J:/BBIS-G8/Biology/"
valid_images = [".jpg",".gif",".png",".tga"]
for f in os.listdir(path1):
a = 0
ext = os.path.splitext(f)[1]
if ext.lower() not in valid_images:
continue
a += 1
file1 = os.path.join(path1,f)
# print(file1)
imgs.append(Image.open(os.path.join(path1,f)))
current_section = mydoc.sections[0]
new_width, new_height = current_section.page_height, current_section.page_width
new_section = mydoc.add_section(WD_SECTION.NEW_PAGE)
new_section.orientation = WD_ORIENT.LANDSCAPE
new_section.page_width = new_width
new_section.page_height = new_height
mydoc.add_picture(file1, width=docx.shared.Inches(4), height=docx.shared.Inches(5))
mydoc.save("J:/biology.docx")

Converting Multiples wav files into Spectrogram

I'm trying to convert multiple .wav files into spectrograms, and so far I just got to write a code that can convert a single .wav file into Mel Spectrogram and its waveform.
Can anyone help me to rewrite the code for converting multiple .wav files?
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from glob import glob
import os
Audio_path = '01.wav'
frame = 2048
Hop = 521
plt.figure(figsize = (7,5))
signal, sr = librosa.load(Audio_path)
def wave():
#librosa.display.waveshow(signal, sr=sr)
plt.xlabel("Time")
plt.ylabel("Amplitude")
plt.title("Audio Waveform")
plt.show()
def spectro():
stft_scale = np.abs(librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=frame, hop_length=Hop))
stft_scale = librosa.power_to_db(stft_scale, ref=np.max)
librosa.display.specshow(stft_scale, fmax=8000)
#librosa.display.specshow(stft_scale, fmax=8000, y_axis='log', x_axis='time')
#plt.title("Audio Spectrogram")
plt.colorbar(format = '%+2.0f dB')
plt.show()
wave()
spectro()

Scraping multiple pages with Scrapy and saving as a csv file

I want to scrape all the pages of Internshala and extract the Job ID, Job name, Company name and the Last date to apply and store everything in a csv to later convert to a dataframe.
import requests
import scrapy
from bs4 import BeautifulSoup
from scrapy import Selector
from scrapy.crawler import CrawlerProcess
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
import string
import pandas as pd
url='https://internshala.com/fresher-jobs'
sel=Selector(text=BeautifulSoup(requests.get(url).content).prettify())
pages=sel.xpath('//span[#id="total_pages"]').xpath('normalize-space(./text())').extract()
pages[0]=int(pages[0])
print(pages[0]) #which gives -> 4
class jobMan(scrapy.Spider):
name='job'
to_remove={0:["\n ","\n "],\
1:['\n ','\n ']}
def start_requests(self):
urls="https://internshala.com/fresher-jobs/page-1"
yield scrapy.Request(url=urls,callback=self.parse)
def parse(self,response):
ID=response.xpath('//div[#class="container-fluid individual_internship visibilityTrackerItem"]/#internshipid').extract()
Job_Post = response.xpath('//div[#class="heading_4_5 profile"]/a').xpath('normalize-space(./text())').extract()
Company = response.xpath('//a[#class="link_display_like_text"]').xpath('normalize-space(./text())').extract()
Apply_By = response.xpath('//div[#class="internship_other_details_container"]/div[#class="other_detail_item_row"][2]//div[#class="item_body"]').xpath('normalize-space(./text())').extract()
for page in range(2,pages[0]+1):
yield(scrapy.Request(url=f"https://internshala.com/fresher-jobs/page-{page}",callback=self.parse))
yield {
'ID': ID,
'Job':Job_Post,
'Company':Company,
'Apply_By':Apply_By
}
process=CrawlerProcess(settings={
'FEED_URI':'JOBSS.csv',
'FEED_FORMAT':'csv'
})
process.crawl(jobMan)
process.start()
And then finally-:
final=pd.read_csv('JOBSS.csv')
print(final)
Which gave me-:
ID Job \
0 NaN Product Developer - Science,Salesforce Develop...
1 NaN Business Development Manager,Mobile App Develo...
2 NaN Software Engineer,Social Media Strategist And ...
3 NaN Reactjs Developer,Full Stack Developer,Busines...
Company \
0 Open Door Education,Aekot Consulting And Techn...
1 ISB Studienkolleg,TutorBin,Alphacore Technolog...
2 CrewKarma,Internshala,Mithi Software Technolog...
3 Startxlabs Technologies Private Limited,RavGin...
Apply_By
0 7 Aug' 21,7 Aug' 21,7 Aug' 21,7 Aug' 21,7 Aug'...
1 31 Jul' 21,30 Jul' 21,30 Jul' 21,31 Jul' 21,30...
2 24 Jul' 21,24 Jul' 21,23 Jul' 21,23 Jul' 21,23...
3 11 Jul' 21,11 Jul' 21,11 Jul' 21,11 Jul' 21,11...
Doubt_1-: Why is it not printing the IDs ?? I tried scraping just the ID for the first page using the same xpath and I got the correct output but not while crawling.
/
Doubt_2-: I wanted a a dataframe such that, for example, the Job_Post column contains each job post's name nested under each other (means as a new row) from all the pages merged but I am getting rows per page.
How can I solve these issues ?? Please help
Doubt_1-: Why is it not printing the IDs ?? I tried scraping just the ID for the first page using the same xpath and I got the correct output but not while crawling.
Because the class name has a space in it, use:
ID=response.xpath('//div[contains(#class, "container-fluid individual_internship visibilityTrackerItem")]/#internshipid').extract()

How to import data from a HTML table on a website to excel?

I would like to do some statistical analysis with Python on the live casino game called Crazy Time from Evolution Gaming. There is a website that has the data to do this: https://tracksino.com/crazytime. I want the data of the lowest table 'Spin History' to be imported into excel. However, I do not now how this can be done. Could anyone give me an idea where to start?
Thanks in advance!
Try the below code:
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import csv
import datetime
def scrap_history():
csv_headers = []
file_path = '' #mention your system where you have to save the file
file_name = 'spin_history.csv' # filename
page_number = 1
while True:
#Dynamic URL fetching data in chunks of 100
url = 'https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=' + str(page_number) + '&per_page=100&period=24hours'
print('-' * 100)
print('URL created : ',url)
response = requests.get(url,verify=False)
result = json.loads(response.text) # loading data to convert in JSON.
history_data = result['data']
print(history_data)
if history_data != []:
with open(file_path + file_name ,'a+') as history:
#Headers for file
csv_headers = ['Occured At','Slot Result','Spin Result','Total Winners','Total Payout',]
csvwriter = csv.DictWriter(history, delimiter=',', lineterminator='\n',fieldnames=csv_headers)
if page_number == 1:
print('Writing CSV header now...')
csvwriter.writeheader()
#write exracted data in to csv file one by one
for item in history_data:
value = datetime.datetime.fromtimestamp(item['when'])
occured_at = f'{value:%d-%B-%Y # %H:%M:%S}'
csvwriter.writerow({'Occured At':occured_at,
'Slot Result': item['slot_result'],
'Spin Result': item['result'],
'Total Winners': item['total_winners'],
'Total Payout': item['total_payout'],
})
print('-' * 100)
page_number +=1
print(page_number)
print('-' * 100)
else:
break
Explanation:
I have implemented the above script using python requests way. The API url https://api.tracksino.com/crazytime_history?filter=&sort_by=&sort_desc=false&page_num=1&per_page=50&period=24hours extarcted from the web site itself(refer screenshot). In the very first step script will take the dynamic URL where page number is dynamic and changed upon on every iteration. For ex:- first it will be page_num = 1 then page_num = 2 and so on till all the data will get extracted.

pyspark how to sum and produce top 10 using pyspark

I have a csv file with two fields, a key and a value:
{1Y4dZ123eAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZ123433MGooBmVzBLUWEZ1234CUY91},8.530366
{1YdZ2344AMGooBmVzBLUWE123JfCCUY91},8.530366
{1YdECDNthiMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBDJTdBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZ123qeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBm123LUWEZ2JfCCUY91},8.530366
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{13uZ6tSr5oh1ui9Hd1tEqJKo2AHhJ6JdFS},0.03895804
What I'm trying to do is sum up the second column and group by the first column, then derive the top 10 keys with the highest values.
Below is the code I've tried using but I get a 'tuple index out of range' error:
import re
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
sc = pyspark.SparkContext()
spark = SparkSession(sc)
voutFile = sc.textFile("input/voutfiltered.csv")
features=voutFile.map(lambda l:
(l.split(',')[0],float(l.split(',')[1])))
top10 = features.takeOrdered(10, key = lambda x: -x[2])
for record in top10:
print("{}: {};{}".format(record[0],record[1],record[2]))```
Any particular reason why you're not using the DataFrame API? It's much more flexible, convenient and faster than the RDD API.
import pyspark.sql.functions as f
df = spark.read.format("csv").option("header", "true").load("/path/to/your/file.csv/")
(df.groupBy(f.col("key_col"))
.agg(f.count(f.col("value_col")).alias("count_value_col"))
.sort(col("count_value_col").desc())
.limit(10)
.show())

Resources