Converting Multiples wav files into Spectrogram - wav

I'm trying to convert multiple .wav files into spectrograms, and so far I just got to write a code that can convert a single .wav file into Mel Spectrogram and its waveform.
Can anyone help me to rewrite the code for converting multiple .wav files?
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from glob import glob
import os
Audio_path = '01.wav'
frame = 2048
Hop = 521
plt.figure(figsize = (7,5))
signal, sr = librosa.load(Audio_path)
def wave():
#librosa.display.waveshow(signal, sr=sr)
plt.xlabel("Time")
plt.ylabel("Amplitude")
plt.title("Audio Waveform")
plt.show()
def spectro():
stft_scale = np.abs(librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=frame, hop_length=Hop))
stft_scale = librosa.power_to_db(stft_scale, ref=np.max)
librosa.display.specshow(stft_scale, fmax=8000)
#librosa.display.specshow(stft_scale, fmax=8000, y_axis='log', x_axis='time')
#plt.title("Audio Spectrogram")
plt.colorbar(format = '%+2.0f dB')
plt.show()
wave()
spectro()

Related

Openpyxl, Date time -writing only first cell

I want to write the names of a set of video files with their corresponding file length to an excel sheet. The list of video populates in the first column. The length of the video does not populate in the excel sheet.
Video1.mp4 | 03:23
Video2.mp4 | 09:33
Video3.mp4 | 04:77
Video4.mp4 | 02:00
I want to populate the names of the video in column1
and length of the videos in columnB.
import os
from mutagen.mp4 import MP4
from datetime import timedelta
from openpyxl import Workbook
path = r'C:\Users\Me\videos'
wb = Workbook()
ws=wb.active
for files in os.listdir(path):
full = os.path.join(path, files)
audio = MP4(full)
formated = timedelta(seconds=audio.info.length)
ws.cell(row=1, column=2).value = timedelta(seconds=audio.info.length)
print(formated)
for r in range(0,len(os.listdir(path))):
ws.cell(row=r+1,column=1).value= os.listdir(path)[r]
wb.save('text.xlsx')

Download multiple 10-ks documents

I need to download multiple 10-ks documents, however, this code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function]. Here's code.
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm
Here's the py file that includes project_helper functions.
import matplotlib.pyplot as plt
import requests
from ratelimit import limits, sleep_and_retry
class SecAPI(object):
SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
#staticmethod
#sleep_and_retry
# Dividing the call limit by half to avoid coming close to the limit
#limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
def _call_sec(url):
return requests.get(url)
def get(self, url):
return self._call_sec(url).text
def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
indentation = ' '
print('[')
for ten_k in ten_k_data:
print_statement = '{}{{'.format(indentation)
for field in fields:
value = str(ten_k[field])
# Show return lines in output
if isinstance(value, str):
value_str = '\'{}\''.format(value.replace('\n', '\\n'))
else:
value_str = str(value)
# Cut off the string if it gets too long
if len(value_str) > field_length_limit:
value_str = value_str[:field_length_limit] + '...'
print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)
print_statement += '},'
print(print_statement)
print(']')
The first step it to download NLP Corpora.
nltk.download('stopwords')
nltk.download('wordnet')
Than Get 10ks
#cik_lookup = {
# 'GOOGL':'0001288776',
# 'AAPL':'0000320193',
# 'FACEBOOK':'0001326801',
# 'AMZN':'0001018724',
# 'MSFT':'0000789019'}
cik_lookup = {
'AEP': '0000004904',
'AXP': '0000004962',
'BA': '0000012927',
'BK': '0001390777',
'CAT': '0000018230',
'DE': '0000315189',
'DIS': '0001001039',
'DTE': '0000936340',
'ED': '0001047862',
'EMR': '0000032604',
'ETN': '0001551182',
'GE': '0000040545',
'IBM': '0000051143',
'IP': '0000051434',
'JNJ': '0000200406',
'KO': '0000021344',
'LLY': '0000059478',
'MCD': '0000063908',
'MO': '0000764180',
'MRK': '0000310158',
'MRO': '0000101778',
'PCG': '0001004980',
'PEP': '0000077476',
'PFE': '0000078003',
'PG': '0000080424',
'PNR': '0000077360',
'SYY': '0000096021',
'TXN': '0000097476',
'UTX': '0000101829',
'WFC': '0000072971',
'WMT': '0000104169',
'WY': '0000106535',
'XOM': '0000034088'}
Get list of 10-ks
sec_api = project_helper.SecAPI()
from bs4 import BeautifulSoup
def get_sec_data(cik, doc_type, start=0, count=60):
newest_pricing_data = pd.to_datetime('2021-01-01')
rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
'&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
.format(cik, doc_type, start, count)
sec_data = sec_api.get(rss_url)
feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
entries = [
(
entry.content.find('filing-href').getText(),
entry.content.find('filing-type').getText(),
entry.content.find('filing-date').getText())
for entry in feed.find_all('entry', recursive=False)
if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
return entries
example_ticker = 'AEP'
sec_data = {}
for ticker, cik in cik_lookup.items():
sec_data[ticker] = get_sec_data(cik, '10-K')
The code works fine if i download the 10-ks between 5-10 companies. But if i increase the number of companies in [cik_lookup function] I get the following error. The first error I got is as below.
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
5 rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' .format(cik, doc_type, start, count)
6 sec_data = sec_api.get(rss_url)
----> 7 feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
8 entries = [
9 (
UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)
However, after some google search over BeutifulSoup(ecodes) I changed it to utf-8 and then got the following error.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
20
21 for ticker, cik in cik_lookup.items():
---> 22 sec_data[ticker] = get_sec_data(cik, '10-K')
<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
11 entry.content.find('filing-type').getText(),
12 entry.content.find('filing-date').getText())
---> 13 for entry in feed.find_all('entry', recursive=False)
14 if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
15
AttributeError: 'NoneType' object has no attribute 'find_all'
The project can be accessed here at the following github repo.
github repo herealso.

pyspark how to sum and produce top 10 using pyspark

I have a csv file with two fields, a key and a value:
{1Y4dZ123eAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZ123433MGooBmVzBLUWEZ1234CUY91},8.530366
{1YdZ2344AMGooBmVzBLUWE123JfCCUY91},8.530366
{1YdECDNthiMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBDJTdBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZ123qeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBmVzBLUWEZ2JfCCUY91},8.530366
{1YdZDNHqeAMGooBm123LUWEZ2JfCCUY91},8.530366
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{17RJgv5ujkFerSd48Akdd2GneUAW47nphQ},20.0
{13uZ6tSr5oh1ui9Hd1tEqJKo2AHhJ6JdFS},0.03895804
What I'm trying to do is sum up the second column and group by the first column, then derive the top 10 keys with the highest values.
Below is the code I've tried using but I get a 'tuple index out of range' error:
import re
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
sc = pyspark.SparkContext()
spark = SparkSession(sc)
voutFile = sc.textFile("input/voutfiltered.csv")
features=voutFile.map(lambda l:
(l.split(',')[0],float(l.split(',')[1])))
top10 = features.takeOrdered(10, key = lambda x: -x[2])
for record in top10:
print("{}: {};{}".format(record[0],record[1],record[2]))```
Any particular reason why you're not using the DataFrame API? It's much more flexible, convenient and faster than the RDD API.
import pyspark.sql.functions as f
df = spark.read.format("csv").option("header", "true").load("/path/to/your/file.csv/")
(df.groupBy(f.col("key_col"))
.agg(f.count(f.col("value_col")).alias("count_value_col"))
.sort(col("count_value_col").desc())
.limit(10)
.show())

Inserting Labels in Bokeh

I am trying to inser labels in Bokeh and it is not working.
My code is:
from bokeh.io import show, output_file
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.models import NumeralTickFormatter
df_carteira_grafico = df_resumo_1
df_carteira_grafico['mes_status'] = (df_carteira_grafico['mes_juncao'].astype(dtype=str))+' - '+df_carteira_grafico['Atraso']
output_notebook()
p=figure()
carteira = df_carteira_grafico['mes_status']
tamanho = df_resumo_1['Valor a Entregar']
p = figure(x_range=carteira, plot_height=300, title="Status_Carteira")
p.vbar(x=carteira, top=tamanho, width=0.9)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.yaxis[0].formatter = NumeralTickFormatter(format="0.0")
show(p)
I am getting this:
I want to get this:
Tks for the help.
If you put your data in a ColumnDataSource yourself, then that source can be used to drive both the vbar and a LabelSet as demonstrated in the documentation. Something like:
# CDS can also be created directly from data frames, but not clear in your case
source = ColumnDataSource(data=
dict(carteira=carteira, tamanho=tamanho, labels=[str(x) for x in tamanho])
)
p.vbar(x='carteira', top='tamanho', width=0.9, source=source)
labels = LabelSet(x='carteira', y='tamanho', text='labels', y_offset=5, source=source)
p.add_layout(labels)
However please note that I could not actually test this directly, because the example code in your question was not self-contained and complete. Hopefully it points the way, though.
See Providing Data for Plots and Tables for more information about Bokeh data sources.
Got it. For thse who may need in the future, here is the code:
bokeh.io import show, output_file
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.models import NumeralTickFormatter
from numpy import pi
from bokeh.models import ColumnDataSource
from bokeh.models import LabelSet
df_carteira_grafico = df_resumo_1
df_carteira_grafico['mes_status'] = (df_carteira_grafico['mes_juncao'].astype(dtype=str))+' - '+df_carteira_grafico['Atraso']
output_notebook()
p=figure()
carteira = df_carteira_grafico['mes_status']
tamanho = df_resumo_1['Valor a Entregar']
source = ColumnDataSource(data=dict(carteira=carteira, tamanho=tamanho, labels=[str(x) for x in tamanho]))
p = figure(x_range=carteira, plot_height=400, title="Status_Carteira")
p.vbar(x='carteira', top='tamanho', width=0.9, source=source)
labels = LabelSet(x='carteira', y='tamanho', text='labels', y_offset=5, source=source)
p.add_layout(labels)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.yaxis[0].formatter = NumeralTickFormatter(format="0.0")
show(p)

Replacing figure and table in layout when using global ColumnDataSource

I am using bokeh 0.12.9. I have a table and a figure which I replace in the global layout on callback. I usually build the ColumnDataSource right before I build the new figure/table. Now I wanted to try and see if I can have a global ColumnDataSource so that I can adjust the data via a CDSView (no need to replace table/figure then).
Unfortunately even keeping a separate CDS and view for table and plot fails. When clicking the radio button a couple of times I receive the following javascript error:
Uncaught TypeError: Cannot read property 'data' of undefined
from datetime import date
from random import randint
from bokeh.models import Line
import numpy as np
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
import bokeh.layouts as layouts
import bokeh.models.widgets as widgets
from bokeh.io import curdoc
from bokeh.models import CustomJS, Slider
from bokeh import palettes
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, CDSView, IndexFilter
from bokeh.models import widgets
def gen_plot(source=None, view=None):
p = figure(title='test',
x_axis_type="datetime",
plot_width=600, plot_height=400)
colors = palettes.Category10[10]
cols = [str(col) for col in source.column_names]
for ix, col in enumerate(cols):
if col == 'index':
continue
r = p.line(x='index', y=col, source=source, view=view,
legend='_' + col,
color=colors[ix])
p.legend.location = "bottom_left"
return p
def gen_table(source=None, view=None):
columns = [TableColumn(field=ele, title=ele) for ele
in source.column_names]
tab = widgets.DataTable(source=source, view=view, columns=columns,
selectable=False,
reorderable=False,
width=600, height=400)
return tab
def update(attr, old, new):
p = gen_plot(source=cdss[0], view=vs[0])
t = gen_table(source=cdss[1], view=vs[1])
print l.children
l.children[1] = p
l.children[2].children[0] = t
# set up data
cols = ['col1', 'col2', 'col3', 'col4']
df1 = pd.DataFrame(pd.util.testing.getTimeSeriesData())
df1.columns = cols
df2 = pd.DataFrame(pd.util.testing.getTimeSeriesData())
df2.columns = cols
dfs = [df1, df2]
cds1 = ColumnDataSource(df1)
cds2 = ColumnDataSource(df2)
cdss = [cds1, cds2]
filters = [IndexFilter([0, 1, 2, 4])]
filters = []
v1 = CDSView(source=cds1, filters=filters)
v2 = CDSView(source=cds2, filters=filters)
vs = [v1, v2]
# initialize items to replace
p = gen_plot(source=cdss[0], view=vs[0])
t = gen_table(source=cdss[1], view=vs[1])
# initialize controls
radio_wghting = widgets.RadioButtonGroup(labels=["Equal", "Exponential"],
active=0)
radio_wghting.on_change('active', update)
# set up layout
sizing_mode = 'fixed'
l = layout([radio_wghting, p, t], sizing_mode=sizing_mode)
curdoc().add_root(l)
curdoc().title = 'blub'
# call callback initially
update('value', 0, 0)
Any hints are much appreciated!
Now I wanted to try and see if I can have a global ColumnDataSource so
that I can adjust the data via a CDSView (no need to replace
table/figure then).
The code you are showing is the one in which you are trying to replace the figure and table.
When you replace the child of a layout object in that way, you are not actually removing the previous figures from curdoc, and other elements in the document still have the old figures and tables in their references.
You could try something like that to update the sources directly.
for rend in p.renderers:
try:
rend.data_source
except AttributeError:
pass
else:
rend.data_source.data.update(new_data_dictionary)
and
t.source.data.update(new_data_dictionary)
EDIT to answer the comment
from bokeh.io import curdoc
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Button
from bokeh.layouts import gridplot, widgetbox
from random import random, choice
import numpy as np
my_data = {1:{'x':[],'y':[],'colo':[],'size':[]}}
kelly_colors = [ '#F3C300','#875692', '#F38400', '#A1CAF1','#BE0032', '#C2B280', '#848482','#008856', '#E68FAC', '#0067A5',
'#F99379', '#604E97', '#F6A600','#B3446C', '#DCD300', '#882D17','#8DB600', '#654522', '#E25822','#2B3D26', ]
x = np.arange(0,50,0.1)
def rand_dict():
rand_x = [choice(x) for i in range(7)]
return {'x':rand_x,'y':np.array([random()*100 for i in rand_x]),'colo':np.array([choice(kelly_colors) for i in rand_x]),'size':np.array([(5+int(random()*50)) for i in rand_x])}
def add_stuff():
global my_data
my_data[max(my_data.keys())+1] = rand_dict()
make_doc()
def change_stuff():
global my_data
myfig = curdoc().select_one({"name":"myfig"})
for i,rend in enumerate(myfig.renderers):
try:
rend.data_source
except AttributeError:
pass
else:
my_data[i+1] = rand_dict()
rend.data_source.data.update(my_data[i+1])
def clear_stuff():
global my_data
my_data = {1:{'x':[],'y':[],'colo':[],'size':[]}}
make_doc()
def make_doc():
curdoc().clear()
myfig = figure(plot_width=1000,plot_height=800,outline_line_alpha=0,name='myfig')
myfig.x_range.start = -5
myfig.x_range.end = 55
myfig.y_range.start = -10
myfig.y_range.end = 110
myfig.renderers = []
add_button = Button(label='add stuff',width=100)
change_button = Button(label='change stuff',width=100)
clear_button = Button(label='clear stuff',width=100)
add_button.on_click(add_stuff)
change_button.on_click(change_stuff)
clear_button.on_click(clear_stuff)
grid = gridplot([[myfig,widgetbox(add_button,change_button,clear_button)]],toolbar_location=None)
curdoc().add_root(grid)
update_doc()
def update_doc():
myfig = curdoc().select_one({"name":"myfig"})
for key in my_data:
myfig.scatter(x='x',y='y',color='colo',size='size',source=ColumnDataSource(data=my_data[key]))
curdoc().title = 'mytitle'
make_doc()
what I like about doing this is that you can just save the my_data dictionary with numpy, load it later and keep changing your plots from there.
def load_data():
global my_data
my_data = np.load(path_to_saved_data).item()
make_doc()
You can probably do something similar using pandas dataframes, I am just more comfortable with plain dictionaries.

Resources