Extract data.frame column names from rows in data.frame - r

I have a couple of data.frames which have approximately the same structure. For a reproducible example I created two sample dataframes df1 and df2.
df1 <- structure(list(sample = c(2L, 6L), data1 = c(56L, 78L), data2 = c(59L,
27L), data6 = c(90L, 28L), data1namet = structure(c(1L, 1L), .Label = "Sam1", class = "factor"),
data2namab = structure(c(1L, 1L), .Label = "Test2", class = "factor"),
dataame = structure(c(1L, 1L), .Label = "Ex3", class = "factor")), .Names = c("sample",
"data1", "data2", "data3", "data1namet", "data2namab", "dataame"
), class = "data.frame", row.names = c(NA, -2L))
df1
sample data1 data2 data3 data1namet data2namab dataame
1 2 56 59 90 Sam1 Test2 Ex3
2 6 78 27 28 Sam1 Test2 Ex3
df2 <- structure(list(sample = c(12L, 13L, 17L), data1 = c(56L, 78L,
3L), data2 = c(59L, 27L, 2L), datest = structure(c(1L, 1L,
1L), .Label = "Exa9", class = "factor"), dattestr = structure(c(1L,
1L, 1L), .Label = "cz1", class = "factor")), .Names = c("sample",
"data1", "data2", "datest", "dattestr"), class = "data.frame", row.names = c(NA,
-3L))
df2
sample data1 data2 datest dattestr
1 12 56 59 Exa9 cz1
2 13 78 27 Exa9 cz1
3 17 3 2 Exa9 cz1
The name of the data is saved in the columns after the data columns and I was wondering if there is a way I could restructure the data.frames (about 40 data.frames) that they contain the name of the data in their column name?
df1
sample Sam1 Test2 Ex3
1 2 56 59 90
2 6 78 27 28
and
df2
sample Exa9 cz1
1 12 56 59
2 13 78 27
3 17 3 2
EDIT
As I just realised I also have other columns after the data columns so that my input data looks like this
df1 <- structure(list(sample = c(2L, 6L), data1 = c(56L, 78L), data2 = c(59L,
27L), data3 = c(90L, 28L), data1namet = structure(c(1L, 1L), .Label = "Sam1", class = "factor"),
data2namab = structure(c(1L, 1L), .Label = "Test2", class = "factor"),
dataame = structure(c(1L, 1L), .Label = "Ex3", class = "factor"),
ma = c("Jay", "Jay")), .Names = c("sample", "data1", "data2",
"data3", "data1namet", "data2namab", "dataame", "ma"), row.names = c(NA,
-2L), class = "data.frame")
df1
sample data1 data2 data3 data1namet data2namab dataame ma
1 2 56 59 90 Sam1 Test2 Ex3 Jay
2 6 78 27 28 Sam1 Test2 Ex3 Jay
df2 <- structure(list(sample = c(12L, 13L, 17L), data1 = c(56L, 78L,
3L), data2 = c(59L, 27L, 2L), datest = structure(c(1L, 1L, 1L
), .Label = "Exa9", class = "factor"), dattestr = structure(c(1L,
1L, 1L), .Label = "cz1", class = "factor"), add = c(2, 2, 2)), .Names = c("sample",
"data1", "data2", "datest", "dattestr", "add"), row.names = c(NA,
-3L), class = "data.frame")
df2
sample data1 data2 datest dattestr add
1 12 56 59 Exa9 cz1 2
2 13 78 27 Exa9 cz1 2
3 17 3 2 Exa9 cz1 2
In this case the ma and add column are not part of the data and should be added at the end like this:
df1
sample Sam1 Test2 Ex3 ma
1 2 56 59 90 Jay
2 6 78 27 28 Jay
and
df2
sample Exa9 cz1 add
1 12 56 59 2
2 13 78 27 2
3 17 3 2 2

One could start by identifying which columns should be kept:
keep_col <- which(sapply(df2, is.numeric))
After that, some work is required to extract the new column names and to rename the corresponding columns in the data frame:
names <- df2[1,keep_col[-1] + length(keep_col)-1]
colnames(df2)[keep_col[-1]] <- as.character(unlist(names))
Finally, the dataframe can be reassembled by keeping only the desired columns:
df2 <- df2[,keep_col]
#> df2
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
In order to use this transformation for several different dataframes, the code can be wrapped into a function:
# Rename the data columns of `x` after the labels stored in its trailing
# label columns and keep only the numeric columns.
#
# Layout relied on by the index arithmetic below: the first numeric column is
# an id column, the remaining numeric columns are data columns, and the label
# for the i-th data column sits (number of numeric columns - 1) positions to
# its right. Labels are read from the first row only.
#
# x: a data.frame.
# Returns: a data.frame holding the numeric columns of `x`, with every data
#   column renamed to its label.
summarize_table <- function(x) {
  # vapply() always returns a logical vector, so which() is safe even when
  # `x` has no columns (sapply() would return an empty list there).
  numeric_cols <- which(vapply(x, is.numeric, logical(1)))
  data_cols <- numeric_cols[-1]
  label_cols <- data_cols + length(numeric_cols) - 1
  # Convert each label column separately: unlist() on a mix of factor and
  # character columns would turn the factor labels into integer codes.
  new_names <- vapply(
    x[label_cols],
    function(col) as.character(col[1]),
    character(1)
  )
  colnames(x)[data_cols] <- unname(new_names)
  # drop = FALSE keeps a data.frame when only one numeric column exists; the
  # value is returned visibly instead of as the result of an assignment.
  x[, numeric_cols, drop = FALSE]
}
If the various dataframes are stored in a list, the function summarize_table() can be used with lapply() to obtain the results for each dataframe:
my_dfs <- list(df1,df2)
out <- lapply(my_dfs,summarize_table)
#> out
#[[1]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[2]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
EDIT / ADDENDUM
The modified version below should be able to handle also the cases mentioned in the revised post:
# Rename the data columns of `x` after their label columns, drop the label
# columns, and keep any further columns at the end of the result.
#
# Layout relied on: the first run of adjacent numeric columns is
# "id, data_1, ..., data_k"; the k columns directly after that run hold the
# labels (read from the first row); everything after the label columns is an
# extra column that is carried over unchanged, whatever its type.
#
# x: a data.frame.
# Returns: a data.frame with the id column, the renamed data columns and the
#   extra columns, in that order. A frame without numeric columns is returned
#   unchanged.
summarize_tab2 <- function(x) {
  numeric_cols <- which(vapply(x, is.numeric, logical(1)))
  if (length(numeric_cols) == 0) {
    return(x)
  }

  # Only the leading run of adjacent numeric columns is the id/data block;
  # numeric columns after a gap (e.g. a numeric extra column) are not data.
  gaps <- which(diff(numeric_cols) != 1)
  block_len <- if (length(gaps) > 0) gaps[1] else length(numeric_cols)
  block <- unname(numeric_cols[seq_len(block_len)])

  data_cols <- block[-1]
  label_cols <- data_cols + length(data_cols)
  if (any(label_cols > ncol(x))) {
    stop("expected one label column after each data column", call. = FALSE)
  }

  # Convert each label column separately: unlist() on a mix of factor and
  # character columns would turn the factor labels into integer codes.
  new_names <- vapply(
    x[label_cols],
    function(col) as.character(col[1]),
    character(1)
  )
  colnames(x)[data_cols] <- unname(new_names)

  # Every column to the right of the label columns is an extra column,
  # independent of whether it is numeric or not.
  last_used <- max(block, label_cols)
  all_cols <- seq_len(ncol(x))
  extra_cols <- all_cols[all_cols > last_used]

  # List-style indexing never drops to a vector.
  x[c(block, extra_cols)]
}
my_dfs <- list(df1, df2, df3, df4)
out <- lapply(my_dfs, summarize_tab2)
#> out
#[[1]]
# sample Sam1 Test2 Ex3 ma
#1 2 56 59 90 Jay
#2 6 78 27 28 Jay
#
#[[2]]
# sample Exa9 cz1 add
#1 12 56 59 2
#2 13 78 27 2
#3 17 3 2 2
#
#[[3]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[4]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
Here the dataframes df3 and df4 are, respectively, the data frames df1 and df2 of the original post.

The following should work:
library(plyr)
# Data columns are named "data" followed by exactly one character (data1, ...).
cols.to.rename <- grep('^data(.)$', colnames(df1))
stopifnot(length(cols.to.rename) > 0)
# One label column per data column, starting right after the last data column.
cols.of.names <- max(cols.to.rename) + seq_along(cols.to.rename)
# vapply() returns a character vector, so the column names stay a character
# vector instead of being turned into a list by the assignment below.
the.names <- unname(
  vapply(df1[1, cols.of.names, drop = FALSE], as.character, character(1))
)
df1.mod <- df1
colnames(df1.mod)[cols.to.rename] <- the.names
# Drop the label columns now that their values are used as column names.
df1.mod <- df1.mod[-cols.of.names]
It renames all dataX columns to the (first) value in the columns following the last dataX column. It then drops all name columns from the data frame.

Related

one hot encoding only factor variables in R recipes

I have a dataframe df like so
height age dept
69 18 A
44 8 B
72 19 B
58 34 C
I want to one-hot encode only the factor variables (only dept is a factor). How can I do this?
Right now I'm selecting everything,
and getting this warning:
Warning message:
The following variables are not factor vectors and will be ignored: height, age
ohe <- df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect::everything()) %>%
recipes::prep() %>%
recipes::bake(df)
Use where(is.factor) instead of everything(), so that only the factor columns are selected:
library(dplyr)
# where() is an exported tidyselect helper (tidyselect >= 1.2.0), so it is
# accessed with `::` rather than reaching into package internals with `:::`.
# Only columns for which is.factor() is TRUE are dummy-encoded.
df %>%
  recipes::recipe(~ .) %>%
  recipes::step_dummy(tidyselect::where(is.factor)) %>%
  recipes::prep() %>%
  recipes::bake(df)
-output
# A tibble: 4 × 4
height age dept_B dept_C
<int> <int> <dbl> <dbl>
1 69 18 0 0
2 44 8 1 0
3 72 19 1 0
4 58 34 0 1
data
df <- structure(list(height = c(69L, 44L, 72L, 58L), age = c(18L, 8L,
19L, 34L), dept = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")

How I can split and sort this data set

Here is a small sample size of my data:
var colour no Mcolour Ncolour
sa1_fr_19 B 10 66 3
sa1_fr_19 W 12 85 6
su3_sa2_18 B 8 70 9
su3_sa2_18 W 6 24 1
I want to get this table:
year var sort nB McolourB NcolourB nW McolourW NcolourW
19 sa1 fr 10 66 3 12 85 6
18 su3 sa2 8 70 9 6 24 1
It would be good if we could do it using base R code.
Split the columns on '_' and use pivot_wider.
library(magrittr)
library(tidyr)
df %>%
separate(var, c('var', 'sort', 'year'), sep = '_') %>%
pivot_wider(names_from = colour, values_from = c(no, Mcolour, Ncolour), names_sep = '')
# var sort year noB noW McolourB McolourW NcolourB NcolourW
# <chr> <chr> <chr> <int> <int> <int> <int> <int> <int>
#1 sa1 fr 19 10 12 66 85 3 6
#2 su3 sa2 18 8 6 70 24 9 1
data
df <- structure(list(var = c("sa1_fr_19", "sa1_fr_19", "su3_sa2_18",
"su3_sa2_18"), colour = c("B", "W", "B", "W"), no = c(10L, 12L,
8L, 6L), Mcolour = c(66L, 85L, 70L, 24L), Ncolour = c(3L, 6L,
9L, 1L)), class = "data.frame", row.names = c(NA, -4L))
Using data.table
library(splitstackshape)
library(data.table)
dcast(cSplit(df, "var", sep="_"), var_1 + var_2 + var_3 ~ colour,
value.var = c("no", "Mcolour", "Ncolour"))
var_1 var_2 var_3 no_B no_W Mcolour_B Mcolour_W Ncolour_B Ncolour_W
1: sa1 fr 19 10 12 66 85 3 6
2: su3 sa2 18 8 6 70 24 9 1
data
df <- structure(list(var = c("sa1_fr_19", "sa1_fr_19", "su3_sa2_18",
"su3_sa2_18"), colour = c("B", "W", "B", "W"), no = c(10L, 12L,
8L, 6L), Mcolour = c(66L, 85L, 70L, 24L), Ncolour = c(3L, 6L,
9L, 1L)), class = "data.frame", row.names = c(NA, -4L))

how to calculate a specific subset in dataframe in r and save the calculation in another list

I have two lists:
list 1:
id name age
1 jake 21
2 ashly 19
45 lana 18
51 james 23
5675 eric 25
list 2 (tv watch):
id hours
1 1.1
1 3
1 2.5
45 5.6
45 3
51 2
51 1
51 2
This is just an example; the real lists are very big: list 1 has 5000 ids, and lists 2/3/4 each have more than 1 million rows (the ids are not unique).
For each of the lists from list 2 onwards, I need to calculate the average/sum/count for every id and add the value to list 1.
Note that the result has to be saved in another list, which has a different number of rows.
example:
list 1:
id name age tv_average
1 jake 21 2.2
2 ashly 19 n/a
45 lana 18 4.3
51 james 23 1.6667
5675 eric 25 n/a
This is my attempt:
for (i in 1:nrow(list2)) {
p <- subset(list2,list2$id==i)
list2$tv_average[i==list2$id] <- sum(p$hours)/(nrow(p))
}
error:
Out of 22999 rows, it only works on 21713 rows.
Try this
#Sample Data
data1 = structure(list(id = c(1L, 2L, 45L, 51L, 5675L), name = structure(c(3L,
1L, 5L, 4L, 2L), .Label = c("ashly", "eric", "jake", "james",
"lana"), class = "factor"), age = c(21L, 19L, 18L, 23L, 25L)
), .Names = c("id",
"name", "age"), row.names = c(NA, -5L), class = "data.frame")
data2 = structure(list(id = c(1L, 1L, 1L, 3L, 45L, 45L, 51L, 51L, 51L,
53L), hours = c(1.1, 3, 2.5, 10, 5.6, 3, 2, 1, 2, 6)), .Names = c("id",
"hours"), class = "data.frame", row.names = c(NA, -10L))
# Use aggregate to calculate Average, Sum, and Count and Merge
merge(x = data1,
y = aggregate(hours~id, data2, function(x)
c(mean = mean(x),
sum = sum(x),
count = length(x))),
by = "id",
all.x = TRUE)
# id name age hours.mean hours.sum hours.count
#1 1 jake 21 2.200000 6.600000 3.000000
#2 2 ashly 19 NA NA NA
#3 45 lana 18 4.300000 8.600000 2.000000
#4 51 james 23 1.666667 5.000000 3.000000
#5 5675 eric 25 NA NA NA

How to merge two data.frames by the first part of a names in a column?

I have two text files:
f1
A B
sam 23
dam 90
f2
G A K
43 DQF_df_gf 65
54 sam_df_bnol 90
56 jay_df_nkol 89
67 dam_df_etr 43
45 fds_df_lpko 78
As you can see, f1$A contains two names that I want to use to merge with f2 by A. The problem is that they match only the first part of the values in f2$A.
Using :
gh=merge(f1,f2,by="A",all=TRUE)
will give no error but rows of NA
desired output:
A B G K
sam 23 54 90
dam 90 67 43
You have to extract the first parts of column "A" in f2 so that you can use it to merge by. If you want to do that in one step including the merge, you could use:
merge(f1, transform(f2, A = sub("^([^_]+).*", "\\1", A)), by = "A")
# A B G K
#1 dam 90 67 43
#2 sam 23 54 90
This doesn't require adding a new column to f2. It only changes a copy of f2 that is lost after the merge, i.e. f2 remains unchanged.
You can see that I did this extraction inside the merge:
transform(f2, A = sub("^([^_]+).*", "\\1", A))
# G A K
#1 43 DQF 65
#2 54 sam 90
#3 56 jay 89
#4 67 dam 43
#5 45 fds 78
Another check:
x <- c("KR.S._", "SS#2_")
sub("^([^_]+).*", "\\1", x)
#[1] "KR.S." "SS#2"
Sample data used:
f1 <- structure(list(A = structure(c(2L, 1L), .Label = c("dam", "sam"
), class = "factor"), B = c(23L, 90L)), .Names = c("A", "B"), class = "data.frame", row.names = c(NA,
-2L))
f2 <- structure(list(G = c(43L, 54L, 56L, 67L, 45L), A = structure(c(2L,
5L, 4L, 1L, 3L), .Label = c("dam_df_etr", "DQF_df_gf", "fds_df_lpko",
"jay_df_nkol", "sam_df_bnol"), class = "factor"), K = c(65L,
90L, 89L, 43L, 78L)), .Names = c("G", "A", "K"), class = "data.frame", row.names = c(NA,
-5L))
You can use merge() to achieve what you want, but instead of using f2 directly, you can merge the f1 data frame with a data frame consisting of f2 along with a temporary column for merging:
gh <- merge(f1,
cbind(f2, merge=gsub('^(.*?)_.*', '\\1', f2$A)),
by.x=c("A"),
by.y=c("merge"))
> gh
A B G A K
1 dam 90 67 dam_df_etr 43
2 sam 23 54 sam_df_bnol 90

R: Extract list columns based on column names and patterns

I have a list (here only sample data)
my_list <- list(structure(list(sample = c(2L, 6L), data1 = c(56L, 78L),
data2 = c(59L, 27L), data3 = c(90L, 28L), data1namet = structure(c(1L,
1L), .Label = "Sam1", class = "factor"), data2namab = structure(c(1L,
1L), .Label = "Test2", class = "factor"), dataame = structure(c(1L,
1L), .Label = "Ex3", class = "factor"), ma = c("Jay", "Jay"
)), .Names = c("sample", "data1", "data2", "data3", "data1namet",
"data2namab", "dataame", "ma"), row.names = c(NA, -2L), class = "data.frame"),
structure(list(sample = c(12L, 13L, 17L), data1 = c(56L,
78L, 3L), data2 = c(59L, 27L, 2L), datest = structure(c(1L,
1L, 1L), .Label = "Exa9", class = "factor"), dattestr = structure(c(1L,
1L, 1L), .Label = "cz1", class = "factor"), add = c(2, 2,
2)), .Names = c("sample", "data1", "data2", "datest", "dattestr",
"add"), row.names = c(NA, -3L), class = "data.frame"))
my_list
[[1]]
sample data1 data2 data3 data1namet data2namab dataame ma
1 2 56 59 90 Sam1 Test2 Ex3 Jay
2 6 78 27 28 Sam1 Test2 Ex3 Jay
[[2]]
sample data1 data2 datest dattestr add
1 12 56 59 Exa9 cz1 2
2 13 78 27 Exa9 cz1 2
3 17 3 2 Exa9 cz1 2
I've got two problems:
I would like to extract columns in this list based on patterns of their column names, e.g. all columns which contain the word 'data' in their column name. I wasn't able to find a solution with grep.
I know how to extract one column based on their index number (see example below), but how could I do this selection directly based on the column name (not the column number)?
out <- lapply(my_list, `[`, 1) # extract "sample" column
Try
# Select every column whose name contains the literal text "data".
# drop = FALSE keeps each result a data.frame even if only one column matches.
lapply(
  my_list,
  function(df) df[, grep("data", names(df), fixed = TRUE), drop = FALSE]
)
# [[1]]
# data1 data2 data3 data1namet data2namab dataame
# 1 56 59 90 Sam1 Test2 Ex3
# 2 78 27 28 Sam1 Test2 Ex3
#
# [[2]]
# data1 data2
# 1 56 59
# 2 78 27
# 3 3 2
lapply(my_list, "[", "sample")
# [[1]]
# sample
# 1 2
# 2 6
#
# [[2]]
# sample
# 1 12
# 2 13
# 3 17

Resources