multiply all columns in data frame in R - r

I need to multiply all columns in a data frame with each other. As an example, I need to achieve the following:
mydata$C1_2<-mydata$sic1*mydata$sic2
but for all my columns with values going from 1 to 733 (sic1, sic2, sic3,..., sic733).
I've tried the following but it doesn't work:
# Intended: for every pair (i, j), store the product of columns sic<i> and sic<j>
# in a new column named C<i>_<j>.
for(i in 1:733){
for(j in 1:733){
# Not valid R as written: `$` takes a literal name, so `C[i]_[j]` and `sic[i]`
# do not build column names from the values of i and j.
mydata$C[i]_[j]<-mydata$sic[i]*mydata$sic[j]
}
}
Could you help me? Thanks for your help.

Leaving aside the question of whether you really want what you think you want, I feel like this could help:
# Small example data frame with three integer columns.
df <- data.frame(a = 1:4, b = 1:4, c = 4:1)

# Element-wise product of two columns of `df`, each selected by name.
multiplyColumns <- function(name1, name2, df) {
  left <- df[, name1]
  right <- df[, name2]
  left * right
}

# Every ordered pair of column names, including each column paired with itself.
combinations <- expand.grid(names(df), names(df), stringsAsFactors = FALSE)

# Label each pair as "<first>_<second>".
names4result <- paste(combinations$Var1, combinations$Var2, sep = "_")

# One product column per pair, collected into a data frame.
result <- as.data.frame(
  mapply(
    multiplyColumns,
    combinations$Var1,
    combinations$Var2,
    MoreArgs = list(df = df)
  )
)
names(result) <- names4result
result

Related

problems looping with fastLink

First of all, sorry for my English, I'm translating with google translator
I have two df to which I apply fastLink
df1<-data.frame(col1=c("pruebaA","pruebaA","pruebaA","pruebaB","pruebaB","pruebaB"),col2=c("avion","casa","coche","verde","antonio","jardin"), stringsAsFactors = FALSE)
df2<-data.frame(col1=c("pruebaA","pruebaA","pruebaA","pruebaB","pruebaB","pruebaA"),col2=c("avion","casa grande","coche rojo","Berde","antoƱito","jardinn"), stringsAsFactors = FALSE)
library(fastLink)
# Link the records of d1 and d2 with fastLink and publish the result through
# variables assigned outside the function.
# d1, d2: data frames; both are expected to contain the columns "col1" and "col2",
#         which are the variables handed to fastLink.
# Side effects (via `<<-`): `indi` receives out$matches, and `dfA.match` receives
# the rows of d1 selected by out$matches$inds.a.
prueba <- function(d1, d2) {
out <- fastLink(
dfA = d1, dfB = d2,
varnames = c("col1","col2"),
partial.match = c("col2"),
stringdist.match = c("col2")
)
# `<<-` assigns outside the function body, so both values remain available after the call
indi<<- out$matches
dfA.match <<- d1[out$matches$inds.a,]
}
prueba(df1,df2)
I get indi and dfA.match so I can query them.
How could I do the same when I have a lot of df?
I can't make a loop
For example,
I divide df1 and df2 into parts
df1$M <- paste0(df1$col1, "_df1")
z <- split(df1,df1$M )
list2env(z, .GlobalEnv)
df2$M <- paste0(df2$col1,"_df2")
b <- split(df2,df2$M )
list2env(b, .GlobalEnv)
I get
-pruebaA_df1
-pruebaA_df2
-pruebaB_df1
-pruebaB_df2
prueba(pruebaA_df1,pruebaA_df2)
prueba(pruebaB_df1,pruebaB_df2)
works!
Same with a loop
# Distinct values of col1; each one is used as the prefix of a pair of object names
unique(df1$col1)->nom2b
# Named lists that collect, per prefix, the `indi` and `dfA.match` values after each prueba() call
indices<- list()
uniones<- list()
for (i in nom2b){
# NOTE: d1 and d2 are character strings holding object names, not the data frames themselves
d1<-paste0(i,"_df1")
d2<-paste0(i,"_df2")
#cat(d1)->d1
#cat(d2)->d2
# prueba() therefore receives two strings here
prueba(d1,d2)
indices[[paste0("modelo",i)]]<-indi
uniones[[paste0("uniones",i)]]<- dfA.match
}
Wrong!!, it doesn't work!!
Assuming you have objects called pruebaA_df1, pruebaA_df2 .... pruebaA_df1000 in your environment, you can use Reduce as :
result <- Reduce(prueba, mget(paste0('pruebaA_df', 1:1000)))

Function to change all variables of factor type to lower case

I need to create a function in order to change all my factor variables to lower case.
I've already done that:
# Lower-cases x when x is a factor.
# NOTE: there is no `else` branch, so a non-factor x yields NULL, and tolower()
# on a factor gives back a character vector rather than a factor.
change_lower=function(x){if(is.factor(x)) tolower(x)}
But I think I'm doing something wrong, maybe the if isn't good for what I want. Any ideas?
You can use mutate_if if you want to automatically convert a large number of columns. Be sure to convert to character first (as @DanY pointed out):
library(dplyr)
# stringsAsFactors = TRUE: since R 4.0.0 data.frame() keeps strings as character
# by default, so without it no column is a factor and the next line changes nothing.
df <- data.frame(
  x = c(1, 2, 3),
  y = c("A", "B", "C"),
  z = c("i", "K", "l"),
  stringsAsFactors = TRUE
)
# Convert each factor column to character first, then lower-case it.
df <- df %>% mutate_if(is.factor, function(x) tolower(as.character(x)))
In base R:
# stringsAsFactors = TRUE: since R 4.0.0 data.frame() keeps strings as character
# by default, so without it `ind` would be empty and nothing would be converted.
df <- data.frame(
  x = c(1, 2, 3),
  y = c("A", "B", "C"),
  z = c("i", "K", "l"),
  stringsAsFactors = TRUE
)
# Names of the factor columns; vapply() guarantees a logical vector.
ind <- names(df)[vapply(df, is.factor, logical(1))]
# Replace each factor column by its lower-cased character values.
for (i in ind) {
  df[[i]] <- tolower(as.character(df[[i]]))
}
or
# Same conversion without an explicit loop: build the lower-cased columns with
# lapply() and assign them back to the columns named in `ind`.
df[,ind] <- lapply(ind, function(x) tolower(as.character(df[[x]])))
# Input data (factors are requested explicitly; since R 4.0.0 strings stay
# character by default, which would leave nothing to convert):
df <- data.frame(
  x = c(1, 2, 3),
  y = c("A", "B", "C"),
  z = c("i", "K", "l"),
  stringsAsFactors = TRUE
)
# Convert factors to lowercase. Assigning into `df[]` keeps df a data frame;
# a plain `df <- lapply(...)` would replace it with a list.
df[] <- lapply(df, function(x) {
  if (is.factor(x)) as.factor(tolower(as.character(x))) else x
})
# Proof:
str(df)

How to find specific strings in dataframe using for loop?

I'm using a for loop to find all of the specific strings (df2$x2) in another data frame (df1$x1). My goal is to create a new column, df1$test, and write the matching df2$x2 value into it.
For example:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
Y = c(2017,2017,2018,2018,2017),
Sales = c(25,50,30,40,90))
df1$x1 <- as.character(as.factor(df1$x1))
df2 <- data.frame(x2 = c("TE-T6-5","TE-D31L-2","TE-H6-15","EC500","EC20","TE-D31L-2"),
Y = c(2018,2017,2018,2017,2018,2018),
P = c(100,300,200,50,150,300))
df2$x2 <- as.character(as.factor(df2$x2))
for(i in 1:nrow(df2)){
# i-th search string from df2 (column 1 is x2)
f <- df2[i,1]
# NOTE: this assigns the whole `test` column on every iteration, so only the
# result for the last search string is kept
df1$test <- ifelse(grepl(f, df1$x1),f,"not found")
}
What should I do after the end of the loop? I know the problem is that the new column is overwritten on every iteration. I tried an "if" statement to create a new data frame and save the outputs, but it didn't work: it only writes one specific string.
Thank you in advance.
Expected output:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
output = c("not found","TE-D31L-2","not found","TE-D31L-2","EC20"))
Do you want to have one new column for each string? if that is what you need, your code should be:
df1 <- data.frame(
  x1 = c("TE-T6-3 XYZ12X", "TE-D31L-2 QWE12X", "TE-H6-1 ABC12X",
         "TE-D31L-2 QWE12X", "EC20 QWX12X"),
  Y = c(2017, 2017, 2018, 2018, 2017),
  Sales = c(25, 50, 30, 40, 90)
)
df1$x1 <- as.character(df1$x1)
df2 <- data.frame(
  x2 = c("TE-T6-5", "TE-D31L-2", "TE-H6-15", "EC500", "EC20", "TE-D31L-2"),
  Y = c(2018, 2017, 2018, 2017, 2018, 2018),
  P = c(100, 300, 200, 50, 150, 300)
)
df2$x2 <- as.character(df2$x2)
# unique(): df2$x2 lists "TE-D31L-2" twice; searching for it twice would add two
# columns with the same name.
for (f in unique(df2$x2)) {
  # Add the match flags under a temporary name. grepl() already returns
  # TRUE/FALSE; fixed = TRUE makes the search string match literally instead of
  # being read as a regular expression.
  df1$test <- grepl(f, df1$x1, fixed = TRUE)
  # Rename that last column to the string that was searched for.
  names(df1)[ncol(df1)] <- f
}
For each search string it creates a new column under a temporary name and then renames it to the string that was evaluated. I also replaced "not found" with FALSE, but you can use whatever value you want.
[EDIT:]
If you want that expected output, you can use this code:
# Table to annotate: x1 holds a product code followed by free text.
df1 <- data.frame(
  x1 = c("TE-T6-3 XYZ12X", "TE-D31L-2 QWE12X", "TE-H6-1 ABC12X",
         "TE-D31L-2 QWE12X", "EC20 QWX12X"),
  Y = c(2017, 2017, 2018, 2018, 2017),
  Sales = c(25, 50, 30, 40, 90)
)
df1$x1 <- as.character(df1$x1)

# Lookup table: x2 holds the codes to search for.
df2 <- data.frame(
  x2 = c("TE-T6-5", "TE-D31L-2", "TE-H6-15", "EC500", "EC20", "TE-D31L-2"),
  Y = c(2018, 2017, 2018, 2017, 2018, 2018),
  P = c(100, 300, 200, 50, 150, 300)
)
df2$x2 <- as.character(df2$x2)

# Start from "not found" everywhere, then overwrite only the rows that match.
df1$output <- "not found"
for (f in df2$x2) {
  df1[grepl(f, df1$x1), "output"] <- f
}
This is very similar to what you did, but you needed to index the rows you want to write to.
It only works when each row can have at most one match; it gets a little more complicated if a row can match more than one string. But I think that's not your problem.
You simply need to split the df1$x1 strings on space and merge (or match since you are only interested in one variable)on df2$x2, i.e.
# Keep only the leading code of each x1 value (drop everything from the first whitespace on).
v1 <- sub('\\s+.*', '', df1$x1)
# match() returns positions within df2$x2, so they must index df2$x2 itself.
# Indexing v1 with them only gives the right answer by coincidence.
df2$x2[match(v1, df2$x2)]
#[1] NA "TE-D31L-2" NA "TE-D31L-2" "EC20"

Apply a user defined function to a list of data frames

I have a series of data frames structured similarly to this:
df <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',11:21))
df2 <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',50:60))
In order to clean them I wrote a user defined function with a set of cleaning steps:
# Cleaning steps for one raw data frame; returns the cleaned data frame.
clean <- function(df){
# Use the values in row 2 as the column names
colnames(df) <- df[2,]
# Keep only the rows whose `year` value starts with four digits
df <- df[grep('^[0-9]{4}', df$year),]
return(df)
}
I'd now like to put my data frames in a list:
df_list <- list(df,df2)
and clean them all at once. I tried
lapply(df_list, clean)
and
for(df in df_list){
clean(df)
}
But with both methods I get the error:
Error in df[2, ] : incorrect number of dimensions
What's causing this error and how can I fix it? Is my approach to this problem wrong?
You are close, but there is one problem in code. Since you have text in your dataframe's columns, the columns are created as factors and not characters. Thus your column naming does not provide the expected result.
#need to specify strings to factors as false
df <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',11:21), stringsAsFactors = FALSE)
df2 <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',50:60), stringsAsFactors = FALSE)
# Clean one raw table: promote the header row to column names, keep only the
# data rows, and convert every column to numeric.
#
# df: data frame whose second row holds the column labels (one of them must be
#     "year") and whose data rows start with a four-digit year in that column.
#     Columns may be character or factor.
# Returns the data rows only, with all columns numeric.
clean <- function(df){
  # Row 2 is the header. as.character() per cell yields the labels even when the
  # columns are factors (whose underlying values are integer codes).
  colnames(df) <- vapply(df[2, ], as.character, character(1), USE.NAMES = FALSE)
  #need to specify the column to select the rows
  # drop = FALSE keeps a data frame even if there is only one column
  df <- df[grep('^[0-9]{4}', df$year), , drop = FALSE]
  #convert the columns to numeric values
  # lapply() works column by column; going through as.character() converts the
  # labels of factor columns rather than their codes
  df[] <- lapply(df, function(column) as.numeric(as.character(column)))
  df
}
df_list <- list(df,df2)
lapply(df_list, clean)

Fast and Efficient Way to merge a lot of dataframe in R

I have a large set of data frames (around 50,000). Each data frame has two columns, key and value, with around 100-200 rows. My question is essentially similar to this and this. Following their ideas, I construct a list of data frames and use the Reduce function:
freq_martix<-Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = "key", all = TRUE),
freq_list)
But my code has run for several days. I just wonder if there is a more efficient, faster way to merge a large set of dataframes?
This way is pretty fast.
First of all I created 500 tables, each containing 150 key-value pairs.
library(data.table)
library(stringi)
# Create 500 example data frames named df_1 ... df_500 via assign()
for (i in 1:500) {
# Seed with the loop index so every table is reproducible
set.seed(i)
dfNam <- paste('df', i, sep = '_')
# 150 rows: `key` is a random single letter (lower-cased), `value` is sampled from 1:1000 with replacement.
# NOTE: cbind() combines the two vectors into a character matrix, so `value` is not stored as a number.
df <- data.frame( cbind(key = tolower(stri_rand_strings(150, 1, pattern = '[A-Za-z]')), value = sample(1:1000, 150, replace = TRUE)) )
# Bind the data frame to the generated name
assign(dfNam, df)
# Remove the temporaries so that only the df_<i> objects remain
rm(df)
rm(dfNam)
}
Then I transposed and append them:
# Accumulator for the transposed tables
tmp <- data.table()
# NOTE: ls(pattern = 'df_') returns every object whose name contains "df_", so
# unrelated objects with such names would be picked up as well
for (i in ls(pattern = 'df_') ) {
df <- get(i)
# Transpose so that the values of the first column (the keys) run along the columns
dt <- data.table( transpose(df) )
# The first transposed row holds the keys: use it as column names, then drop it
colnames(dt) <- as.character(unlist(dt[1, ]))
dt <- dt[-1, ]
# Append to the accumulator, matching columns by name; fill = TRUE fills columns missing on either side
tmp <- rbindlist(list(tmp, dt), use.names = TRUE, fill = TRUE)
}
And transposed back after all:
# Transpose back: the columns of tmp (one per key) become the rows of merged_data
merged_data <- transpose(tmp)
# The column names of tmp are the keys; prepend them as the first column
key <- colnames(tmp)
merged_data <- cbind(key, merged_data)
Works like a charm.

Resources