String match within a string, then add a column of matched substrings

I have two dataframes df and df_ref:
I'm using the following code:
df['match']=df[df['swift_codes'].str.contains('|'.join(df_ref['Hfrase_best_search_string']))]
Which gives me this result:
The result I'd like to see is this:
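For the matched-substring column itself, str.extract is the usual tool: wrapping the alternation in a capturing group returns the matched substring (or NaN) instead of the boolean mask that str.contains produces. A minimal sketch with made-up data, since the original frames aren't reproduced here (it also assumes the search strings contain no regex metacharacters; otherwise escape them with re.escape):

import pandas as pd

# Hypothetical sample data standing in for df and df_ref above
df = pd.DataFrame({'swift_codes': ['DEUTDEFF500', 'CHASUS33XXX', 'NOMATCHHERE']})
df_ref = pd.DataFrame({'Hfrase_best_search_string': ['DEUTDEFF', 'CHASUS33']})

# A capturing group makes str.extract return the matched substring itself
pattern = '(' + '|'.join(df_ref['Hfrase_best_search_string']) + ')'
df['match'] = df['swift_codes'].str.extract(pattern, expand=False)
print(df)
#    swift_codes     match
# 0  DEUTDEFF500  DEUTDEFF
# 1  CHASUS33XXX  CHASUS33
# 2  NOMATCHHERE       NaN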

So I was able to resolve this with the following function. The match and merge process I originally asked about starts at the section labelled "# CREATE DATAFRAMES WITH TARGETED VALUES TO COMPARE AND MERGE TO ULTIMATELY YIELD A NEW DATAFRAME WITH THE 'BANK_SHORT_NAME' FROM TEAM'S RECORDS":

import pandas as pd

# NOTE: df1 (the team's precompiled swift-code records) and df2 (ISO
# country codes) are referenced below; they are loaded elsewhere.
def find_swift(df, col_name='ORGNTR_BNK_NM'):
"""Reads in raw data and isolates possible new swift codes which are then analysed
to determine if they are in fact new swift codes. To support decisions, extracted
11-character substrings codes are referenced against established swift code substrings
already compiled in records. Then the function extracts bank codes and country codes
and put them in their own columns. The function will produces a Excel file with two
tabs, one with swift codes and it associate bank name and another with the rejected
strings"""
    # BEGIN CLEANING AND ORGANIZING RAW DATA, SPLIT SUBSTRINGS INTO THEIR OWN
    # COLUMNS AND THEN STACK ALL STRINGS ON TOP OF EACH OTHER
    df_col = df[col_name].dropna()
    df_col = df_col.to_frame()
    # strip everything that is not a letter or digit from the raw data
    df_col = df_col.iloc[:, 0].str.replace(r'[^0-9a-zA-Z]', ' ', regex=True)
    df_col = df_col.to_frame()
    # boolean test: are all whitespace-separated tokens numeric?
    # keep only the non-numeric rows in a new dataframe
    df_col['numeric'] = df_col.iloc[:, 0].map(lambda x: all(i.isdecimal() for i in x.split()))
    df_non_num = df_col[df_col['numeric'] == False]
    df_col = df_col.drop(columns=['numeric'])
    # count the number of tokens per row and take the maximum, so each
    # substring can be put into its own column below
    count = df_non_num.iloc[:, 0].str.split().str.len()
    count = count.to_frame()
    max_cols = count.iloc[:, 0].max()
    # create a new dataframe where each substring has its own column
    df_split = df_non_num.iloc[:, 0].str.split(' ', n=max_cols, expand=True)
    # BEGIN EXTRACTING 11-CHARACTER STRINGS FOR ANALYSIS,
    # THEN APPEND ALL THE RESULTING SERIES INTO ONE
    # keep only the 11-character strings from each split column and save
    # the columns as a list of series in the variable splits
    splits = list()
    for column in df_split:
        df_split[column] = df_split[column][df_split[column].str.len() == 11]
        splits.append(df_split[column])
    # drop NaNs, remove duplicates and save to the list split2
    split2 = list()
    for series in splits:
        series = series.dropna()
        series = series.drop_duplicates()
        split2.append(series)
    # identify the probable swift codes in each series by checking that
    # portions of each string correspond to bank codes, country codes and
    # branch codes found in the precompiled records (df1, df2); the
    # surviving series are stored in the list split3
    split3 = list()
    for series in split2:
        series = series[(series.str[0:4].isin(df1['swift_bank_code']) & series.str[4:6].isin(df2['ISO ALPHA-2 Code'])) |
                        (series.str[8:11].isin(df1['swift_branch_code']) & series.str[0:4].isin(df1['swift_bank_code'])) |
                        (series.str[4:6].isin(df2['ISO ALPHA-2 Code']) & series.str[8:11].isin(df1['swift_branch_code']))]
        series = series.drop_duplicates()
        split3.append(series)
    # stack everything into a single series, then save as a dataframe
    # (pd.concat replaces the deprecated Series.append)
    s = pd.concat(split3)
    s = s.to_frame()
    s.columns = ['swift_codes']
    s = s['swift_codes'].dropna()
    s = s.astype(str)
    s = s.to_frame()
    # CREATE DATAFRAMES WITH TARGETED VALUES TO COMPARE AND MERGE TO ULTIMATELY
    # YIELD A NEW DATAFRAME WITH THE 'BANK_SHORT_NAME' FROM TEAM'S RECORDS
    # create dataframes holding the string prefixes below, to facilitate
    # merging with the team's records (df1)
    s_four = s['swift_codes'].str[0:4].to_frame()
    s_six = s['swift_codes'].str[0:6].to_frame()
    s_eight = s['swift_codes'].str[0:8].to_frame()
    s_ten = s['swift_codes'].str[0:10].to_frame()
    s_eleven = s['swift_codes'].str[0:11].to_frame()
    # create a dataframe from df1 with only the 'short bank name' and
    # 'best search string' columns for easier dataframe management
    df1b = df1[['Hfrase_short_name', 'Hfrase_best_search_string']]
    # split the previously compiled records by search-string length to
    # facilitate comparison and merging with the identified codes
    df1_11 = df1b[df1b['Hfrase_best_search_string'].str.len() == 11]
    df1_10 = df1b[df1b['Hfrase_best_search_string'].str.len() == 10]
    df1_8 = df1b[df1b['Hfrase_best_search_string'].str.len() == 8]
    df1_6 = df1b[df1b['Hfrase_best_search_string'].str.len() == 6]
    df1_4 = df1b[df1b['Hfrase_best_search_string'].str.len() == 4]
    # merge each prefix dataframe with its corresponding records dataframe;
    # the merge pulls the 'bank_short_name' in from the compiled records
    s_eleven = s_eleven.reset_index().merge(df1_11, how='left', left_on=['swift_codes'], right_on=['Hfrase_best_search_string']).set_index('index')
    s_ten = s_ten.reset_index().merge(df1_10, how='left', left_on=['swift_codes'], right_on=['Hfrase_best_search_string']).set_index('index')
    s_eight = s_eight.reset_index().merge(df1_8, how='left', left_on=['swift_codes'], right_on=['Hfrase_best_search_string']).set_index('index')
    s_six = s_six.reset_index().merge(df1_6, how='left', left_on=['swift_codes'], right_on=['Hfrase_best_search_string']).set_index('index')
    s_four = s_four.reset_index().merge(df1_4, how='left', left_on=['swift_codes'], right_on=['Hfrase_best_search_string']).set_index('index')
    # stack all the merged dataframes together
    all_dfs = [s_four, s_six, s_eight, s_ten, s_eleven]
    swift_result = pd.concat(all_dfs)
    # drop NaN and duplicate values, then sort by index
    swift_result = swift_result.dropna()
    swift_result = swift_result.drop_duplicates()
    swift_result = swift_result.sort_index()
    # keep only 'Hfrase_short_name', then join the result back onto the
    # full set of swift codes found earlier
    swift_result = swift_result[['Hfrase_short_name']]
    swift_bank = s.join(swift_result, how='left', lsuffix='_original')
    swift_bank.columns = ['swift_codes', 'bank_short_name']
    # new columns for the country code and bank code extracted from the swift codes
    swift_bank['swift_country_code'] = swift_bank['swift_codes'].str[4:6]
    swift_bank['swift_bank_code'] = swift_bank['swift_codes'].str[0:4]
    # drop duplicates
    swift_bank = swift_bank.drop_duplicates()
    swift = pd.DataFrame(swift_bank)
    # merge the results back onto the original and the cleaned dataframes on the index
    new_df = pd.merge(df, swift, how='outer', left_index=True, right_index=True)
    new_skinny_df = pd.merge(df_col, swift, how='outer', left_index=True, right_index=True)
    return (swift, new_df, new_skinny_df)
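For context, a hypothetical call would look like this (df_raw is a made-up name for the raw input frame):

swift, new_df, new_skinny_df = find_swift(df_raw, col_name='ORGNTR_BNK_NM')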
I know it's a lot, but I can explain. Let me know if anyone wants a walkthrough.

Related

How to select a particular dataframe from a list of dataframes in Python equivalent to R?

I have a list of dataframes in R, from which I'm trying to select a particular dataframe as follows:
x = listOfdf$df1$df2$df3
Now I'm trying hard to find an equivalent way to do this in Python, i.e. the syntax for selecting a particular DataFrame from a list of DataFrames in pandas.
Found a solution to select a particular dataframe/dataframe_column from a list of dataframes.
In R : x = listOfdf$df1$df2$df3
In Python : x = listOfdf['df1']['df2']['df3']
Thank you :)
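Worth noting: despite the name listOfdf, the ['df1'] syntax in Python requires a dictionary (a list is indexed by integer position). A minimal sketch with hypothetical contents:

import pandas as pd

# Nested dictionaries of dataframes: index by key, much like R's $
listOfdf = {'df1': {'df2': {'df3': pd.DataFrame({'A': [1, 2]})}}}
x = listOfdf['df1']['df2']['df3']
# A plain Python list of dataframes is indexed by position instead
dfs = [pd.DataFrame({'A': [1]}), pd.DataFrame({'B': [2]})]
y = dfs[0]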
I see you've already answered your own question, and that's cool. However, as jezrael hints in his comment, you should really consider using a dictionary. That might sound a bit scary coming from R (been there myself, now I prefer Python in most ways), but it will be worth your effort.
First of all, a dictionary is a way of mapping a value or variable to a key (like a name). You use curly brackets { } to build the dictionary, and use square brackets [ ] to index it.
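A tiny illustration with made-up values:

prices = {'apple': 1.5, 'pear': 2.0}  # build with curly brackets
print(prices['apple'])                # index with square brackets -> 1.5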
Let's say that you have two dataframes like this:
np.random.seed(123)
# Reproducible input - Dataframe 1
rows = 10
df_1 = pd.DataFrame(np.random.randint(90,110,size=(rows, 2)), columns=list('AB'))
datelist = pd.date_range(pd.Timestamp(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df_1['dates'] = datelist
df_1 = df_1.set_index(['dates'])
df_1.index = pd.to_datetime(df_1.index)
##%%
# Reproducible input - Dataframe 2
rows = 10
df_2 = pd.DataFrame(np.random.randint(10,20,size=(rows, 2)), columns=list('CD'))
datelist = pd.date_range(pd.Timestamp(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df_2['dates'] = datelist
df_2 = df_2.set_index(['dates'])
df_2.index = pd.to_datetime(df_2.index)
With a limited number of dataframes you can easily organize them in a dictionary this way:
myFrames = {'df_1': df_1,
            'df_2': df_2}
Now you have a reference to your dataframes, as well as your own defined names or keys. You'll find a more elaborate explanation here.
Here's how you use it:
print(myFrames['df_1'])
You can also use that reference to make changes to one of your dataframes, and add that to your dictionary:
df_3 = myFrames['df_1']
df_3 = df_3*10
myFrames.update({'df_3': df_3})
print(myFrames)
Now let's say that you have a whole bunch of dataframes that you'd like to organize the same way. You can make a list of the names of all available dataframes as described below. However, you should be aware that using eval() is often not recommended, for many reasons.
Anyway, here we go: First you get a list of strings of all dataframe names like this:
alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
It's more than likely that you won't be interested in ALL of them if you've got a lot going on at the same time. So let's say that the names of all your dataframes of particular interest start with 'df_'. You can isolate them like this:
dfNames = []
for elem in alldfs:
    if str(elem)[:3] == 'df_':
        dfNames.append(elem)
Now you can use that list in combination with eval() to make a dictionary:
myFrames2 = {}
for dfName in dfNames:
    myFrames2[dfName] = eval(dfName)
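If you'd rather avoid eval() altogether, here is a sketch of an alternative (assuming the same 'df_' naming convention and that the dataframes live at module level) that reads the names straight out of globals():

import pandas as pd

# Build the same dictionary without eval(): scan globals() for
# DataFrames whose names start with 'df_'.
myFrames2 = {name: obj for name, obj in globals().items()
             if name.startswith('df_') and isinstance(obj, pd.DataFrame)}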
Now you can loop through that dictionary and do something with each of them.
You could, as an example, take the last column of each dataframe and collect them as the columns of a new dataframe:
j = 1
for key in myFrames2.keys():
    # Build new column names for your brand new df
    colName = []
    colName.append('column_' + str(j))
    if j == 1:
        # First, make a new df by referencing the dictionary
        df_new = myFrames2[key]
        # Subset the last column and make sure it doesn't
        # turn into a pandas series instead of a dataframe in the process
        df_new = df_new.iloc[:, -1].to_frame()
        # Set new column names
        df_new.columns = colName[:]
    else:
        # df_new already exists, so you can add new columns for the rest;
        # assigning a series aligns on the shared date index
        df_new[colName[0]] = myFrames2[key].iloc[:, -1]
    j = j + 1
print(df_new)
Hope you'll find this useful!
And by the way... For your next question, please provide some reproducible code as well as a few words about what solutions you have tried yourself. You can read more about how to ask an excellent question here.
And here is the whole thing for an easy copy&paste:
#%%
# Imports
import pandas as pd
import numpy as np
np.random.seed(123)
# Reproducible input - Dataframe 1
rows = 10
df_1 = pd.DataFrame(np.random.randint(90,110,size=(rows, 2)), columns=list('AB'))
datelist = pd.date_range(pd.Timestamp(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df_1['dates'] = datelist
df_1 = df_1.set_index(['dates'])
df_1.index = pd.to_datetime(df_1.index)
##%%
# Reproducible input - Dataframe 2
rows = 10
df_2 = pd.DataFrame(np.random.randint(10,20,size=(rows, 2)), columns=list('CD'))
datelist = pd.date_range(pd.Timestamp(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df_2['dates'] = datelist
df_2 = df_2.set_index(['dates'])
df_2.index = pd.to_datetime(df_2.index)
print(df_1)
print(df_2)
##%%
# If you don't have that many dataframes, you can organize them in a dictionary like this:
myFrames = {'df_1': df_1,
            'df_2': df_2}
# Now you can reference df_1 in that collecton by using:
print(myFrames['df_1'])
# You can also use that reference to make changes to one of your dataframes,
# and add that to your dictionary
df_3 = myFrames['df_1']
df_3 = df_3*10
myFrames.update({'df_3': df_3})
# And now you have a happy little family of dataframes:
print(myFrames)
##%%
# Now let's say that you have a whole bunch of dataframes that you'd like to organize the same way.
# You can make a list of the names of all available dataframes like this:
alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
##%%
# It's likely that you won't be interested in all of them if you've got a lot going on.
# Let's say that all your dataframes of interest start with 'df_'
# You get them like this:
dfNames = []
for elem in alldfs:
    if str(elem)[:3] == 'df_':
        dfNames.append(elem)
##%%
# Now you can use that list in combination with eval() to make a dictionary:
myFrames2 = {}
for dfName in dfNames:
    myFrames2[dfName] = eval(dfName)
##%%
# And now you can reference each dataframe by name in that new dictionary:
myFrames2['df_1']
##%%
# Loop through that dictionary and do something with each of them.
j = 1
for key in myFrames2.keys():
    # Build new column names for your brand new df
    colName = []
    colName.append('column_' + str(j))
    if j == 1:
        # First, make a new df by referencing the dictionary
        df_new = myFrames2[key]
        # Subset the last column and make sure it doesn't
        # turn into a pandas series instead of a dataframe in the process
        df_new = df_new.iloc[:, -1].to_frame()
        # Set new column names
        df_new.columns = colName[:]
    else:
        # df_new already exists, so you can add new columns for the rest;
        # assigning a series aligns on the shared date index
        df_new[colName[0]] = myFrames2[key].iloc[:, -1]
    j = j + 1
print(df_new)

Combine the names of 2 data frames and assign it to a third in R

I have written a function to join 2 dataframes. But the joined dataframe which I am getting should be stored under a name in the format "x_city".
test <- function(x, m = csv){
    a = as.data.frame(m, stringsAsFactors = FALSE)
    b = a[, -c(1)]
    x$long = as.numeric(x$long)
    x$long = round(x$long, 5)
    x$lat = as.numeric(x$lat)
    x$lat = round(x$lat, 5)
    df_name <- paste()
    name <- paste(x, "city", sep = "_")
    name = join(x = x, y = b, by = c("long", "lat"))
}
test(abc,m=csv)
So, final data frame should be "abc_city".
Thanks in advance!!
I think you are saying that you have a data.frame called, for instance, abc and you would like to use dplyr's join to join it onto another data.frame, m, and give it a new name which is the name of the first data.frame (abc) followed by '_city'. So in your example you would end up with a data.frame called abc_city.
So, the variables that you create in your function only exist in the scope of the function. The call test(abc, m = csv) will return the value of name if you add a return statement, but it won't store it in a variable. In order to store it in a variable called abc_city you would use
abc_city <- test(abc, m = csv)
Building on Chi Pak's answer, you could create a dataframe in the parent frame using:
test <- function(x, m = csv){
    nm <- deparse(substitute(x))
    # a = as.data.frame(m, stringsAsFactors = FALSE)
    # b = a[, -c(1)]
    # x$long = as.numeric(x$long)
    # x$long = round(x$long, 5)
    # x$lat = as.numeric(x$lat)
    # x$lat = round(x$lat, 5)
    assign(paste(nm, "city", sep = "_"),
           join(x = x, y = b, by = c("long", "lat")),
           envir = parent.frame())
    # note: rebuild the name from nm (the deparsed name), not from x itself
    return(get(paste(nm, "city", sep = "_"), envir = parent.frame()))
}
and call it as in your question
test(abc, m=csv)

Add New Field to File Names in Directory - R

I am pulling 10-Ks off the SEC website using the EDGAR package in R. Fortunately, the text files come with a consistent file naming convention: CIK number (this is a unique filing ID)_File type_Date.
Ultimately I want to analyze these by SIC/industry group, so I think the best way to do this would be to add the SIC industry code to this filename rule.
I am including an image of what I would like to do below. It is kind of like a database join, except my file names would be taking on the new field. I'm not sure how to do that; I am pretty new to R and file scripting.
I am assuming that you have a data.frame with a column filenames. (Or a vector containing all the filenames) See the code below:
# A data.frame with a character column 'filenames'
df$CIK <- sapply(df$filenames, FUN = function(x) {unlist(strsplit(x, split = "_"))[1]})
df$CIK <- as.character(df$CIK)
Now, let us assume that you have another data.frame with two columns: CIK and SIC.
# A data.frame with two character columns: 'CIK' and 'SIC'
# df2.
#
# We add another column to the first data.frame: 'new_filenames'
df$new_filename <- sapply(1:nrow(df), FUN = function(idx, CIK, filenames, df2) {
    SIC <- df2$SIC[which(df2$CIK == CIK[idx])]
    new_filename <- as.character(paste(SIC, "_", filenames[idx], sep = ""))
    new_filename
}, CIK = df$CIK, filenames = df$filenames, df2 = df2)
# Now the new filenames are available in df$new_filename
View(df)

How to import large dataset in r splitting and filtering by 3 different criteria when found

I'm dealing with a couple of txt files of climatological data, where each chunk of data is identified by 3 parameters: the parameter measured, the station of measurement, and the year. Each file has more than a million lines. In the past I manually selected each parameter, one at a time, for a given station and year, and read it into R using read.fwf; but with files this size that is absurd and inefficient. Is there any way to automate this process? The file has an "FF" marker every time a new parameter for a given station and year starts, and I want to generate separate files or datasets, named after the station, year and parameter, so I can use them afterwards.
(Screenshot of the file format omitted.)
Circled in red is the "FF", which I guess is intended to mark the start of a new set of records.
Circled in black is the name of the parameter measured (there are 8 different parameter classes in total).
Circled in blue is the year of measurement.
Circled in green is the identifier of the station of measurement.
In the past, I read just what I needed with read.fwf, given the fixed widths in the data, but that separation does not apply to the header of each table.
PRUEBA3 <- read.fwf("SanIgnacio_Pmax24h.txt", header = FALSE, widths = c(5,4,4,6,2,7,2,7,2,7,2,7,2,7,2,7,2,7,2,7,2,7,2,7,2,7,2,10,2),skip=1)
Thanks, any help will be appreciated.
You will need to write a function that loops through the txt files. (The output that you linked to was produced by a database; I assume you don't have access to it.)
Here is how the function could look using the fast fread from data.table and a foreach loop (you can make the loop parallel by registering a parallel backend and changing %do% into %dopar%):
library(data.table)
library(foreach)

myfiles = dir(pattern = ".txt$")
res = foreach(i = seq_along(myfiles)) %do% {
    x = fread(myfiles[i], na.strings = c("", " "))
    # get row indices for the start and end of the daily data;
    # the "V" variables are column indices, I assume these don't change per file
    start.dia = x[, grep("DIA", V2)] + 2
    end.dia = x[, grep("MEDIA", V2)] - 2
    # get name of station
    estacion.detect = x[, grep("ESTACION", V9)]
    estacion.name = x[estacion.detect, V10]
    # subset the data rows, then tag them with the station name
    mydf = x[start.dia:end.dia][, estacion := estacion.name]
    # remove empty rows and columns
    junkcol = which(colSums(is.na(mydf)) == nrow(mydf))
    junkrow = which(rowSums(is.na(mydf)) == ncol(mydf))
    if (length(junkcol) > 0) {
        mydf = mydf[, -junkcol, with = FALSE]
    }
    if (length(junkrow) > 0) {
        mydf = mydf[-junkrow, ]
    }
    # further data cleaning
    mydf
}
# bind all files (fill = TRUE in case columns differ after cleaning)
all = rbindlist(res, fill = TRUE)

Sort strings based on number in part of string

I have a huge dataset that I cannot upload here.
I have two types of columns; their names start with T.H.L or T.H.L.varies..... Both types are numbered in the format So####, e.g., T.H.L.So1_P1_A2 up to T.H.L.So10000_P1_A2.
For each T.H.L column there is a column named T.H.L.varies.... with the same ending.
I want to order the columns by the numbers after So, with the T.H.L column first and then the corresponding T.H.L.varies.... version for each So number.
What I tried was to do
library(gtools)
mySorted<- df2[,mixedorder(colnames(df2))]
Which is close: it sorts them correctly by number, but puts all the T.H.L columns first and then all the T.H.L.varies columns, instead of alternating them.
I have posted the column names to Github:
Okay, let's call the names of your data frame (the names you want to reorder) x:
x = names(df2)
# first remove the ones without numbers
# because we want to use the numbers for ordering
no_numbers = c("T.H.L", "T.H.L.varies....")
x = x[! x %in% no_numbers]
# now extract the numbers so we can order them
library(stringr)
x_num = as.numeric(str_extract(string = x, pattern = "(?<=So)[0-9]+"))
# calculate the order first by number, then alphabetically to break ties
ord = order(x_num, x)
# verify it is working
head(c(no_numbers, x[ord]), 10)
# [1] "T.H.L" "T.H.L.varies...." "T.H.L.So1_P1_A1"
# [4] "T.H.L.varies.....So1_P1_A1" "T.H.L.So2_P1_A2" "T.H.L.varies.....So2_P1_A2"
# [7] "T.H.L.So3_P1_A3" "T.H.L.varies.....So3_P1_A3" "T.H.L.So4_P1_A4"
# [10] "T.H.L.varies.....So4_P1_A4"
# finally, reorder your data frame columns
df2 = df2[, c(no_numbers, x[ord])]
And you should be done.
