Merge (make mean) columns with partially matched header name

Merge (make mean) columns with partially matched header name - r

I have data that looks like this:
AAA_1 AAA_2 AAA_3 BBB_1 BBB_2 BBB_3 CCC
1 1 1 1 2 2 2 1
2 3 1 4 0 0 0 0
3 5 3 0 1 1 1 1
For each row, I want to compute the mean of the columns that share a common feature, as follows:
feature <- c("AAA","BBB","CCC")
the desired output should look like:
AAA BBB CCC
1 1 2 1
2 2.6 0 0
3 2.6 1 1
for each pattern separately I was able to do that:
# Read the table; the first column supplies the row names.
data <- read.table("data.txt", header = TRUE, row.names = 1)
# Row means over every column whose name contains "AAA", as a one-column matrix.
AAA <- as.matrix(rowMeans(data[, grepl("AAA", names(data))]))
But I do not know how to do this partial matching for several patterns in one go.
Also tried some other things like :
for (i in 1:length(features)){
feature[i] <- as.matrix(rowMeans(data[ , grepl(feature[i] , names( data ) ) ]))
}

Assuming your colnames are always structured as shown in your example, then you can split the names and aggregate.
# Group key for every column: the part of its name before the first "_"
# (e.g. "AAA_1" -> "AAA", "CCC" -> "CCC"). Taking the first piece per name
# always yields exactly one key per column; df itself is left unmodified.
new_names <- vapply(
  strsplit(names(df), "_", fixed = TRUE),
  `[`,
  character(1),
  1L
)
# Testing with your data, we need to prevent the loss of dimension by using
# drop = FALSE (a single-column group such as CCC must stay a data frame).
sapply(
  unique(new_names),
  function(i) rowMeans(df[, new_names == i, drop = FALSE])
)
# AAA BBB CCC
#[1,] 1.000000 2 1
#[2,] 2.666667 0 0
#[3,] 2.666667 1 1
Data:
df <- structure(list(AAA_1 = c(1L, 3L, 5L), AAA_2 = c(1L, 1L, 3L),
AAA_3 = c(1L, 4L, 0L), BBB_1 = c(2L, 0L, 1L), BBB_2 = c(2L,
0L, 1L), BBB_3 = c(2L, 0L, 1L), CCC = c(1L, 0L, 1L)), .Names = c("AAA_1",
"AAA_2", "AAA_3", "BBB_1", "BBB_2", "BBB_3", "CCC"), class = "data.frame", row.names = c(NA,
-3L))

Here is another option for you. Seeing your column pattern, I chose to use gsub() and get the first three letters. Using ind which includes AAA, BBB, and CCC, I used lapply(), subsetted the data for each element of ind, calculated row means, and extracted a column for row mean only. Then, I used bind_cols() and created foo. The last thing was to assign column names to foo.
library(dplyr)

# Common prefixes: strip a trailing "_<digits>" from every column name.
ind <- unique(gsub("_\\d+$", "", names(mydf)))

# For each prefix, keep the columns that start with it and reduce them to a
# single column of row means; then bind those columns side by side.
foo <- lapply(ind, function(x) {
  mydf %>%
    select(starts_with(x)) %>%
    transmute(out = rowMeans(.))
}) %>%
  bind_cols() %>%
  as_tibble()

# One column per prefix, so the prefixes can be used directly as names.
names(foo) <- ind
# AAA BBB CCC
# (dbl) (dbl) (dbl)
#1 1.000000 2 1
#2 2.666667 0 0
#3 2.666667 1 1
DATA
mydf <- structure(list(AAA_1 = c(1L, 3L, 5L), AAA_2 = c(1L, 1L, 3L),
AAA_3 = c(1L, 4L, 0L), BBB_1 = c(2L, 0L, 1L), BBB_2 = c(2L,
0L, 1L), BBB_3 = c(2L, 0L, 1L), CCC = c(1L, 0L, 1L)), .Names = c("AAA_1",
"AAA_2", "AAA_3", "BBB_1", "BBB_2", "BBB_3", "CCC"), class = "data.frame", row.names = c(NA,
-3L))

library(dplyr)
library(tidyr)

data %>%
  # Keep the row names as a regular column so they survive the reshaping.
  tibble::rownames_to_column("rowname") %>%
  pivot_longer(-rowname, names_to = "variable", values_to = "value") %>%
  # Drop everything from the first "_" on: "AAA_1" -> "AAA".
  mutate(variable = sub("_.*$", "", variable)) %>%
  group_by(rowname, variable) %>%
  summarise(mean = mean(value), .groups = "drop") %>%
  pivot_wider(names_from = variable, values_from = mean)

Related

Subtract above number that is not NA

I have a table similar to this minimal example without the difference column:
trigger
values
difference
0
3
0
NA
0
NA
1
5
2
0
4
0
NA
1
10
6
I want to subtract the above number (and leave out the NAs) from the number at each trigger point (trigger = 1)
Is there a way to do this in R?
Edit:
I have now the situation where the triggers lie close together like in this example:
trigger
values
difference
0
3
0
NA
0
NA
1
5
2
0
4
1
5
1
0
10
How can I tackle this problem?

Create a grouping column with cumsum on the 'trigger' and taking the lag, then do the difference between the first and last element and replace it as the last value per group
library(dplyr)
# Rows up to and including a trigger share one group id: the running trigger
# count, shifted down one row so each trigger row closes its own group.
df1 %>%
  group_by(grp = lag(cumsum(trigger), default = 0)) %>%
  mutate(
    # NA everywhere except the group's last row, which holds last - first.
    difference = c(rep(NA, n() - 1), last(values) - first(values))
  ) %>%
  ungroup() %>%
  select(-grp)
-output
# A tibble: 7 × 3
trigger values difference
<int> <int> <int>
1 0 3 NA
2 0 NA NA
3 0 NA NA
4 1 5 2
5 0 4 NA
6 0 NA NA
7 1 10 6
For the second case, we may need an if/else condition on the number of rows in each group: the difference is only computed and placed in the last row when the group has more than one row; otherwise the result is NA.
# Same grouping as before: the lagged running count of triggers.
df2 %>%
  group_by(grp = lag(cumsum(trigger), default = 0)) %>%
  mutate(
    # Groups with more than one row get last - first in their final row;
    # any other group is filled with NA.
    difference = if (n() <= 1) {
      NA
    } else {
      c(rep(NA, n() - 1), last(values) - first(values))
    }
  ) %>%
  ungroup()
-output
# A tibble: 7 × 4
trigger values grp difference
<int> <int> <dbl> <int>
1 0 3 0 NA
2 0 NA 0 NA
3 0 NA 0 NA
4 1 5 0 2
5 0 4 1 NA
6 1 5 1 1
7 0 10 2 NA
data
df1 <- structure(list(trigger = c(0L, 0L, 0L, 1L, 0L, 0L, 1L), values = c(3L,
NA, NA, 5L, 4L, NA, 10L)), class = "data.frame", row.names = c(NA,
-7L))
df2 <- structure(list(trigger = c(0L, 0L, 0L, 1L, 0L, 1L, 0L), values = c(3L,
NA, NA, 5L, 4L, 5L, 10L)), class = "data.frame", row.names = c(NA,
-7L))

# Example data: df => data.frame
df <- structure(
  list(
    trigger = c(0L, 0L, 0L, 1L, 0L, 0L, 1L),
    values = c(3L, NA, NA, 5L, 4L, NA, 10L),
    diff_col = c(NA, NA, NA, 2L, -1L, NA, 6L)
  ),
  row.names = c(NA, -7L),
  class = "data.frame"
)

# Start from an all-NA integer column.
df$diff_col <- NA_integer_

# Positions holding an observed (non-NA) value.
observed <- which(!is.na(df$values))

# Each observed value after the first gets its difference to the previous
# observed value.
df$diff_col[observed[-1]] <- diff(df$values[observed])

# Keep the difference only on trigger rows; rows with trigger == 0 become NA.
df$diff_col <- ifelse(df$trigger == 0, NA_integer_, df$diff_col)

Transform dataframe (crosstab by counts)

I have a df (coming from a csv) that has this structure
id att1_beer att1_wine att2_beer att2_wine
1 1 1 0 0
2 0 1 0 1
3 1 1 0 1
4 0 1 0 1
5 1 1 0 0
I would like to get a table (preferably with Tidyverse) to this format:
Beer Wine
Att1 3 5
Att2 0 3
Is this possible? I'm trying to avoid exporting to Excel to do it.

You can reshape to 'long' format with pivot_longer and then take the sum by group:
library(dplyr)
library(tidyr) # provides pivot_longer()

df %>%
  select(-id) %>%
  # "att1_beer" -> grp = "att1", value stored in a column named "beer".
  pivot_longer(
    cols = everything(),
    names_sep = "_",
    names_to = c("grp", ".value")
  ) %>%
  group_by(grp) %>%
  summarise(across(everything(), sum), .groups = "drop")
Or using base R
# Column totals, then arranged as attribute (text before "_") by drink
# (text after "_"), so the rows are labelled att1/att2.
totals <- colSums(df[-1])
tapply(
  totals,
  list(sub("_.*", "", names(totals)), sub(".*_", "", names(totals))),
  sum
)
data
df <- structure(list(id = 1:5, att1_beer = c(1L, 0L, 1L, 0L, 1L),
att1_wine = c(1L,
1L, 1L, 1L, 1L), att2_beer = c(0L, 0L, 0L, 0L, 0L), att2_wine = c(0L,
1L, 1L, 1L, 0L)), class = "data.frame", row.names = c(NA, -5L
))

data.table solution for completeness' sake
library(data.table)
dt <- as.data.table(df) # work on a copy; df itself stays a data.frame
ans <- melt(dt, id.vars = "id") # melt to long format
ans[, c("att", "drink") := tstrsplit(variable, "_", fixed = TRUE)] # split column to variables
dcast(ans, att ~ drink, value.var = "value", fun.aggregate = sum) # cast to wide and sum

A base R option using xtabs + colSums
# Total of every column except the first.
u <- colSums(df[-1])
# One row per column name, split at "_" into its two parts.
parts <- do.call(rbind, strsplit(names(u), "_"))
# Cross-tabulate the totals by those two parts.
xtabs(u ~ ., data.frame(u, parts))
gives
X2
X1 beer wine
att1 3 5
att2 0 3

Count values above 0 and count how many match a pattern in a row (in R)

I would like to count how many rows in each column are >0 and how many of those rows (that are >0) start with "mt-".
The result should also be in a data frame.
Here is an example.
df1
mt-abc 1 0 2
mt-dca 1 1 2
cla 0 2 0
dla 0 3 0
result
above0 2 3 2
mt 2 1 2

In base R you can do :
# TRUE where a value is positive; the first column (V1) holds the names.
mat <- df[-1] > 0
rbind(
  above0 = colSums(mat),
  # Only rows whose name starts with "mt-" may contribute to this count.
  mt = colSums(startsWith(df$V1, "mt-") & mat)
)
# V2 V3 V4
#above0 2 3 2
#mt 2 1 2
Actual data has numbers in the column and names in rownames for which we can do :
# TRUE where a value is positive; here the names live in the row names.
mat <- df > 0
rbind(
  above0 = colSums(mat),
  # Only rows whose row name starts with "mt-" may contribute to this count.
  mt = colSums(startsWith(rownames(df), "mt-") & mat)
)
data
df <- structure(list(V1 = c("mt-abc", "mt-dca", "cla", "dla"), V2 = c(1L,
1L, 0L, 0L), V3 = 0:3, V4 = c(2L, 2L, 0L, 0L)), class = "data.frame",
row.names = c(NA, -4L))

I don't think this is the most elegant approach in the tidyverse, but just out of curiosity:
library(tidyverse)
my_df <- data.frame(
  stringsAsFactors = FALSE,
  var = c("mt-abc", "mt-dca", "cla", "dla"),
  x = c(1L, 1L, 0L, 0L),
  y = c(0L, 1L, 2L, 3L),
  z = c(2L, 2L, 0L, 0L)
)

# Count the positive entries in each of the columns x to z.
count_positive <- function(data) {
  summarise(data, across(.cols = x:z, .fns = ~ sum(.x > 0)))
}

# All rows.
df_1 <- my_df %>%
  count_positive() %>%
  mutate(var = "above0")

# Only rows whose name starts with "mt-".
df_2 <- my_df %>%
  filter(str_detect(var, "^mt-")) %>%
  count_positive() %>%
  mutate(var = "mt")

bind_rows(df_1, df_2)
#> x y z var
#> 1 2 3 2 above0
#> 2 2 1 2 mt
Created on 2020-12-04 by the reprex package (v0.3.0)

How to combine transmute with grep function?

I'm trying to find a way to create a new table with variables using the rowSums() function from an existing dataframe. For example, my existing dataframe is called 'asn' and I want to sum up the values for each row of all variables which contain "2011" in the variable title. I want a new table consisting of just one column called asn_y2011 which contains the sum of each row using the variables containing "2011"
Data
structure(list(row = 1:3, south_2010 = c(1L, 5L, 7L), south_2011 = c(4L,
0L, 4L), south_2012 = c(5L, 8L, 6L), north_2010 = c(3L, 4L, 1L
), north_2011 = c(2L, 6L, 0L), north_2012 = c(1L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-3L))
The existing 'asn' dataframe looks like this
row south_2010 south_2011 south_2012 north_2010 north_2011 north_2012
1 1 4 5 3 2 1
2 5 0 8 4 6 1
3 7 4 6 1 0 2
I'm trying to use the following function:
asn %>%
transmute(asn_y2011 = rowSums(, grep("2011")))
to get something like this
row asn_y2011
1 6
2 6
3 4

Continuing with your code, grep() should work like this:
library(dplyr)
asn %>%
  transmute(
    row,
    # Per-row sum of the columns whose name matches "2011".
    asn_y2011 = rowSums(.[grepl("2011", names(.))])
  )
# row asn_y2011
# 1 1 6
# 2 2 6
# 3 3 4
Or you can use tidy selection in c_across():
# Row by row, add up the columns selected by contains("2011"); ungroup()
# removes the rowwise grouping again afterwards.
asn %>%
rowwise() %>%
transmute(row, asn_y2011 = sum(c_across(contains("2011")))) %>%
ungroup()

Another base R option using rowSums
# Positions of the columns whose name matches "2011".
year_cols <- grep("2011", names(asn))
# First column of asn next to the per-row sum of those columns.
cbind(asn[1], asn_y2011 = rowSums(asn[year_cols]))
which gives
row asn_y2011
1 1 6
2 2 6
3 3 4

An option in base R with Reduce
# Element-wise sum (Reduce with `+`) of the columns whose name ends in "2011",
# bound next to the `row` column.
cbind(df['row'], asn_y2011 = Reduce(`+`, df[endsWith(names(df), '2011')]))
# row asn_y2011
#1 1 6
#2 2 6
#3 3 4
data
df <- structure(list(row = 1:3, south_2010 = c(1L, 5L, 7L), south_2011 = c(4L,
0L, 4L), south_2012 = c(5L, 8L, 6L), north_2010 = c(3L, 4L, 1L
), north_2011 = c(2L, 6L, 0L), north_2012 = c(1L, 1L, 2L)),
class = "data.frame", row.names = c(NA,
-3L))

I think that this code will do what you want:
library(magrittr)
tibble::tibble(row = 1:3, south_2011 = c(4, 0, 4), north_2011 = c(2, 6, 0)) %>%
  # Long format: one line per (row, original column) pair.
  tidyr::gather(-row, key = "key", value = "value") %>%
  # The year is the part of the column name after the last "_".
  dplyr::mutate(year = sub("^.*_", "", key)) %>%
  dplyr::group_by(row, year) %>%
  # Name the total explicitly and return an ungrouped result.
  dplyr::summarise(asn_sum = sum(value), .groups = "drop")
I first load the package magrittr so that I can use the pipe, %>%. I've explicitly listed the packages from which the functions are exported, but you are welcome to load the packages with library if you like.
I then create a tibble, or data frame, like what you specify.
I use gather to reorganize the data frame before creating a new variable, year. I then summarise the counts by value of row and year.

You can try this approach
library(tidyverse)
df2 <- df %>%
  rowwise() %>%
  # Sum every column ending in "_2011" instead of naming the first and last
  # such column, so the code keeps working when regions are added or reordered.
  mutate(asn_y2011 = sum(c_across(ends_with("_2011")))) %>%
  # Drop the rowwise grouping so later steps operate on the whole table again.
  ungroup() %>%
  select(row, asn_y2011)
# row asn_y2011
# <int> <int>
# 1 1 6
# 2 2 6
# 3 3 4
Data
df <- structure(list(row = 1:3, south_2010 = c(1L, 5L, 7L), south_2011 = c(4L, 0L, 4L), south_2012 = c(5L, 8L, 6L), north_2010 = c(3L, 4L, 1L), north_2011 = c(2L, 6L, 0L), north_2012 = c(1L, 1L, 2L)), class = "data.frame", row.names = c(NA,-3L))

Finding the max number of occurrences from the available result

I have a dataframe which looks like -
Id Result
A 1
B 2
C 1
B 1
C 1
A 2
B 1
B 2
C 1
A 1
B 2
Now I need to calculate how many 1's and 2's are there for each Id and then select the number whose frequency of occurrence is the greatest.
Id Result
A 1
B 2
C 1
How can I do that? I have tried using the table function in some way but not able to use it effectively. Any help would be appreciated.

Here you can use aggregate in one step:
df <- data.frame(
  Id = factor(c("A", "B", "C", "B", "C", "A", "B", "B", "C", "A", "B")),
  Result = c(1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L)
)

# Most frequent value of x, whatever values occur (not just 1 and 2).
# Ties are resolved in favour of the smallest value.
most_frequent <- function(x) {
  candidates <- sort(unique(x))
  candidates[which.max(tabulate(match(x, candidates)))]
}

res <- aggregate(Result ~ Id, df, FUN = most_frequent)
res
Result:
Id Result
1 A 1
2 B 2
3 C 1

With data.table you can try (df is your data.frame):
library(data.table) # errors immediately if the package is missing
dt <- as.data.table(df)
# Number of rows for every (Id, Result) pair ...
counts <- dt[, .(times = .N), by = .(Id, Result)]
# ... then, per Id, the Result with the highest count.
counts[, .(Result = Result[which.max(times)]), by = Id]
# Id Result
#1: A 1
#2: B 2
#3: C 1

Using dplyr, you can try
library(dplyr)
df %>%
  # Number of rows for every (Id, Result) pair, stored in column n.
  count(Id, Result) %>%
  group_by(Id) %>%
  # Exactly one row per Id, even when two results are equally frequent.
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(Id, Result)
Id Result
1 A 1
2 B 2
3 C 1

An option using table and ave
# Frequency of every Id/Result combination as a data frame (count in Freq).
freq <- as.data.frame(table(df1))
# TRUE for the rows whose count equals the maximum count of their Id.
is_top <- freq$Freq == ave(freq$Freq, freq$Id, FUN = max)
# Keep those rows and drop the third column.
freq[is_top, -3]
# Id Result
# 1 A 1
# 3 C 1
# 5 B 2

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Merge (make mean) columns with partially matched header name - r

library(dplyr) library(tidyr) data %>% add_rownames() %>% gather("variable", "value", -rowname) %>% mutate(variable = gsub("_.*$", "", variable)) %>% group_by(rowname, variable) %>% summarise(mean = mean(value)) %>% spread(variable, mean)

Related

Subtract above number that is not NA

Transform dataframe (crosstab by counts)

Count values above 0 and count how many match a pattern in a row (in R)

How to combine transmute with grep function?

Finding the max number of occurrences from the available result

Categories

Resources