Apply dplyr function to all but one column - r

Given a data frame with numeric values in all columns except for the last one, how can I compute the mean across the row?
In this example, I am using all columns, including the name column which I would like to omit.
df <- as.data.frame(matrix(1:40, ncol=10)) %>%
mutate(name=LETTERS[1:4]) %>%
mutate(mean=rowMeans(.))
Desired data frame output:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 mean name
1 1 5 9 13 17 21 25 29 33 37 19 A
2 2 6 10 14 18 22 26 30 34 38 20 B
3 3 7 11 15 19 23 27 31 35 39 21 C
4 4 8 12 16 20 24 28 32 36 40 22 D

You could try:
df %>%
mutate(mean = select(., -matches("name")) %>% rowMeans(.))

In your setting, you could use
df <- as.data.frame(matrix(1:40, ncol=10)) %>%
mutate(name=LETTERS[1:4]) %>%
mutate(mean=rowMeans(.[,1:10]))

Related

How to filter DataFrame columns by value keeping original column order?

I'm trying to filter a DataFrame, keeping only columns containing "_time" or "___" in their column names.
I tried using df %>% select(contains(c("_time", "___"))). However, this changes the order of the columns in the output: all columns with "_time" are displayed first and the columns with "___" are displayed last.
How can filtering be done without changing the column order?
We can use matches
library(dplyr)
df %>%
select(matches("_time|___"))
-output
h_time l_time f___d m_time s___hello
1 11 16 21 26 31
2 12 17 22 27 32
3 13 18 23 28 33
4 14 19 24 29 34
5 15 20 25 30 35
compared to
df %>%
select(contains(c("_time", "___")))
h_time l_time m_time f___d s___hello
1 11 16 26 21 31
2 12 17 27 22 32
3 13 18 28 23 33
4 14 19 29 24 34
5 15 20 30 25 35
data
df <- data.frame(col1 = 1:5, col2 = 6:10, h_time = 11:15,
l_time = 16:20, f___d = 21:25, m_time = 26:30,
col_new = 41:45, s___hello = 31:35)
Base R: Data from @akrun (many thanks)
# drop = FALSE keeps the result a data frame even when only one column matches
df[, grepl("_time|___", colnames(df)), drop = FALSE]
h_time l_time f___d m_time s___hello
1 11 16 21 26 31
2 12 17 22 27 32
3 13 18 23 28 33
4 14 19 24 29 34
5 15 20 25 30 35

Split multiple columns with delimiter and have consistent column names in r

I have a large data set and here is a sample (the raw data has more columns):
dta0 = data.frame(cbind(paste(seq(10,15),seq(20,25),sep = ";"),
paste(seq(30,35),seq(40,45),sep = ";") ) )
colnames(dta0) = c("H1","H2")
Here is my desired output
desired_dta = data.frame(cbind(seq(10,15),seq(20,25),seq(30,35),seq(40,45)))
colnames(desired_dta) = c("H1_x","H1_y","H2_x","H2_y")
How can I name columns like "H1_x", "H1_y","H2_x", "H2_y" ....?
You could try
library(tidyr)
dta0 %>%
separate(H1, c("H1_x", "H1_y"), ";") %>%
separate(H2, c("H2_x", "H2_y"), ";")
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45
Or in base R
setNames(as.data.frame(do.call(cbind, lapply(dta0,
function(x) do.call(rbind, strsplit(x, ";"))))),
unlist(lapply(names(dta0), paste0, c("_x", "_y"))))
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45
Here is an option with cSplit
library(splitstackshape)
cSplit(dta0, names(dta0), sep=";")
# H1_1 H1_2 H2_1 H2_2
#1: 10 20 30 40
#2: 11 21 31 41
#3: 12 22 32 42
#4: 13 23 33 43
#5: 14 24 34 44
#6: 15 25 35 45
Another option is using separate_rows() then reshape to long and after that reshape to wide. Here the code:
library(tidyverse)
# Split every "a;b" cell into two rows, label the pieces x / y within each
# original row, then spread back out so each source column yields <col>.x
# and <col>.y.
dta0 %>%
  mutate(id = row_number()) %>%
  separate_rows(c(H1, H2), sep = ";") %>%
  group_by(id) %>%
  mutate(Var = row_number()) %>%
  pivot_longer(-c(id, Var)) %>%
  mutate(
    Var = ifelse(Var == 1, "x", "y"),
    name = paste0(name, ".", Var)
  ) %>%
  select(-Var) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  ungroup() %>%
  select(-id) %>%
  # current_vars() and mutate_each() are deprecated in dplyr; all_of() and
  # across() are the supported replacements.
  select(all_of(sort(names(.)))) %>%
  mutate(across(everything(), as.numeric))
Output:
# A tibble: 6 x 4
H1.x H1.y H2.x H2.y
<dbl> <dbl> <dbl> <dbl>
1 10 20 30 40
2 11 21 31 41
3 12 22 32 42
4 13 23 33 43
5 14 24 34 44
6 15 25 35 45
Assuming your data has an even number of columns:
# Name consecutive column pairs H1_x, H1_y, H2_x, H2_y, ...
nc <- ncol(desired_dta)
colnames(desired_dta) <- paste0(
  "H",
  rep(1:(nc / 2), each = 2),
  rep(c("_x", "_y"), times = nc / 2)
)

Loss of dimensions of dataframe after applying rowMeans() in R

I subset a data frame and applied rowMeans() to it, but the dimensions of the resulting variable ('y') are lost and I am not able to use 'y' in my further code.
dim(mtcars)
# [1] 32 11
y = rowMeans((mtcars[,3:6]))
dim(y)
# NULL
Why is 'y' no longer a data frame? And what can I do to get back its dimensions?
I tried the following but it didn't work.
as.data.frame(y)
# or
data.frame(y)
When you apply rowMeans() you are creating a vector out of a dataframe. So, you are going from n rows and k columns to a nx1 vector.
For a case with n=8 and k=5 we would have:
> a=as.data.frame(matrix(1:40,8,5))
> a
V1 V2 V3 V4 V5
1 1 9 17 25 33
2 2 10 18 26 34
3 3 11 19 27 35
4 4 12 20 28 36
5 5 13 21 29 37
6 6 14 22 30 38
7 7 15 23 31 39
8 8 16 24 32 40
> rowMeans(a)
[1] 17 18 19 20 21 22 23 24

All variables not read from data pipeline - dplyr

I have a dataset with 8 variables. When I run dplyr with the syntax below, my output data frame only has the variables I have used in the dplyr code, while I want all variables.
# Count rows per ID and keep the ID(s) with the fewest rows.
# summarize() collapses each group to one row, so only ID and count survive.
ShowID <- MyData %>%
  group_by(ID) %>% # the column is named ID; R names are case-sensitive
  summarize(count = n()) %>%
  filter(count == min(count))
ShowID
So my output will have two variables - ID and Count. How do I get rest of my variables in the new dataframe? Why is this happening, what am I clueless about here?
> ncol(ShowID)
[1] 2
> ncol(MyData)
[1] 8
MYDATA
key ID v1 v2 v3 v4 v5 v6
0-0-70cf97 1 89 20 30 45 55 65
3ad4893b8c 1 4 5 45 45 55 65
0-0-70cf97d7 2 848 20 52 66 56 56
0-0-70cf 2 54 4 846 65 5 5
0-0-793b8c 3 56454 28 6 4 5 65
0-0-70cf98 2 8 4654 30 65 6 21
3ad4893b8c 2 89 66 518 156 16 65
0-0-70cf97d8 3 89 20 161 1 55 45465
0-0-70cf 5 89 79 48 45 55 456
0-0-793b8c 5 89 20 48 545 654 4
0-0-70cf99 6 9 20 30 45 55 65
DESIRED
key ID count v1 v2 v3 v4 v5 v6
0-0-70cf99 6 1 9 20 30 45 55 65
RESULT FROM CODE
ID count
6 1
You can use the base R ave method to calculate number of rows in each group (ID) and then select those group which has minimum rows.
num_rows <- ave(MyData$v1, MyData$ID, FUN = length)
MyData[which(num_rows == min(num_rows)), ]
# key ID v1 v2 v3 v4 v5 v6
#11 0-0-70cf99 6 9 20 30 45 55 65
You could also use which.min in this case to avoid one step; however, it returns only the first minimum, so it would fail when there are multiple minimum values. Hence I have used which.
No need to summarize:
# mutate() adds the per-ID row count without collapsing rows, so every
# original column is kept; ungroup() makes min(count) global, not per group.
ShowID <- MyData %>%
  group_by(ID) %>% # column is ID in MyData; R names are case-sensitive
  mutate(count = n()) %>%
  ungroup() %>%
  filter(count == min(count))

Melt data frame row by row

How can I melt a data frame row by row?
I found a really similar question on the forum but I still can't solve my problem without a different id variable.
This is my data set:
V1 V2 V3 V4 V5
51 20 29 12 20
51 22 51 NA NA
51 14 NA NA NA
51 75 NA NA NA
And I want to melt it into:
V1 variable value
51 V2 20
51 V3 29
51 V4 12
51 V5 20
51 V2 22
51 V3 51
51 V2 14
51 V2 75
Currently my approach is melting it row by row with a for loop and then rbind them together.
library(reshape)
# One record per line; the header line supplies the column names.
df <- read.table(text = "V1 V2 V3 V4 V5
51 20 29 12 20
51 22 51 NA NA
51 14 NA NA NA
51 75 NA NA NA", header = TRUE)
# Melt one row at a time and append, so the result stays in row order.
dfall <- NULL
for (i in seq_len(nrow(df))) {
  # df[i, ] (not df) so each pass melts only the current row
  dfmelt <- melt(df[i, ], id.vars = "V1", na.rm = TRUE)
  dfall <- rbind(dfall, dfmelt)
}
Just wondering if there is any way to do this faster? Thanks!
We replicate the first column "V1" and the names of the dataset except the first column name to create the first and second column of the expected output, while the 'value' column is created by transposing the dataset without the first column.
# Row-wise melt without extra packages.
# c(t(df1[-1])) lists the values row by row, so V1 must repeat once per
# value column and the value-column names must cycle once per row. Using
# rep() keeps this correct when the number of rows differs from the number
# of value columns.
na.omit(data.frame(
  V1 = rep(df1[[1]], each = ncol(df1) - 1),
  variable = rep(names(df1)[-1], times = nrow(df1)),
  value = c(t(df1[-1]))
))
# V1 variable value
#1 51 V2 20
#2 51 V3 29
#3 51 V4 12
#4 51 V5 20
#5 51 V2 22
#6 51 V3 51
#9 51 V2 14
#13 51 V2 75
NOTE: No additional packages used.
Or we can use gather (from tidyr) to convert the 'wide' to 'long' format after we create a row id column (add_rownames from dplyr) and then arrange the rows.
library(dplyr)
library(tidyr)
# Tag each row with its row name, go wide -> long, then restore row order.
add_rownames(df1) %>%
  gather(variable, value, V2:V5, na.rm = TRUE) %>%
  # rowname is character; sort it numerically so row 10 does not precede
  # row 2 (assumes df1 has the default integer row names)
  arrange(as.integer(rowname), V1) %>%
  select(-rowname)
# V1 variable value
# (int) (chr) (int)
#1 51 V2 20
#2 51 V3 29
#3 51 V4 12
#4 51 V5 20
#5 51 V2 22
#6 51 V3 51
#7 51 V2 14
#8 51 V2 75
Or with data.table
library(data.table)
# keep.rownames = TRUE stores the row names in a character column "rn";
# order on as.integer(rn) so row 10 does not sort before row 2
# (assumes df1 has the default integer row names).
melt(setDT(df1, keep.rownames = TRUE),
     id.vars = c("rn", "V1"), na.rm = TRUE)[
  order(as.integer(rn), V1)][, rn := NULL][]
You can make a column with a unique ID for each row, so you can sort on it after melting. Using dplyr:
library(reshape2)
library(dplyr)
# Tag each row with its position so the melted rows can be sorted back
# into row-by-row order, then drop the helper column.
df %>%
  mutate(id = seq_len(n())) %>%
  melt(id.vars = c("V1", "id"), na.rm = TRUE) %>%
  arrange(V1, id, variable) %>%
  select(-id)
# V1 variable value
# 1 51 V2 20
# 2 51 V3 29
# 3 51 V4 12
# 4 51 V5 20
# 5 51 V2 22
# 6 51 V3 51
# 7 51 V2 14
# 8 51 V2 75
...or base R:
library(reshape2)
df$id <- seq_along(df$V1)
df2 <- melt(df, id.var = c('V1', 'id'), na.rm = TRUE)
df2[order(df2$V1, df2$id, df2$variable),-2]

Resources