I have a data frame(df):
group col
a 12
a 15
a 13
b 21
b 23
Desired output is also a data frame(df1):
col1 col2
12 21
15 23
13 0
Namley, I want to partition "col" of "df" by "group" into multi columns as "col1" and "col2".
When the length of each column is not equal to each other, "0" must be added end of each column untill the length of each column reaches to the maximum column length.
We could either use base R functions split or unstack to split the 'col' by 'group' into a list, then pad NA to list elements that are less than the maximum length of the list element. Change the column names, replace 'NA' by 0.
lst <- unstack(df1, col~group)
d1 <- as.data.frame(sapply(lst, `length<-`, max(sapply(lst, length))))
d1[is.na(d1)] <- 0
colnames(d1) <- paste0('col', 1:ncol(d1))
d1
# col1 col2
#1 12 21
#2 15 23
#3 13 0
Or use stri_list2matrix from stringi
library(stringi)
d1 <- as.data.frame(stri_list2matrix(unstack(df1, col~group),
fill=0), stringsAsFactors=FALSE)
d1[] <- lapply(d1, as.numeric)
Or using data.table/splitstackshape
library(splitstackshape)
setnames(dcast(getanID(df1, 'group'), .id~group, value.var='col',
fill=0L)[, .id:= NULL], paste0('col', 1:2))[]
# col1 col2
#1: 12 21
#2: 15 23
#3: 13 0
How to do it with dplyr...
library(dplyr)
library(tidyr)
df1 %>%
group_by(group) %>%
mutate(n = row_number()) %>%
spread(group, col) %>%
select(-n) %>%
(function(x) { x[is.na(x)] <- 0; x })
Since you fill with zeroes, another idea:
xtabs(col ~ ave(DF$col, DF$group, FUN = seq_along) + group, DF)
# group
#ave(DF$col, DF$group, FUN = seq_along) a b
# 1 12 21
# 2 15 23
# 3 13 0
Where "DF":
DF = structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), col = c(12L, 15L, 13L, 21L, 23L)), .Names = c("group",
"col"), class = "data.frame", row.names = c(NA, -5L))
Related
I have a dataframe with count information (df1)
rownames
sample1
sample2
sample3
m1
0
5
1
m2
1
7
5
m3
6
2
0
m4
3
1
0
and a second with sample information (df2)
rownames
batch
total count
sample1
a
10
sample2
b
15
sample3
a
6
I also have two lists with information about the m values (could easily be turned into another data frame if necessary but I would rather not add to the count information as it is quite large). No patterns (such as even and odd) exist, I am just using a very simplistic example
x <- c("m1", "m3") and y <- c("m2", "m4")
What I would like to do is add another two columns to the sample information. This is a count of each m per sample that has a value of above 5 and appears in list x or y
rownames
batch
total count
x
y
sample1
a
10
1
0
sample2
b
15
1
1
sample3
a
6
0
1
My current strategy is to make a list of values for both x and y and then append them to df2. Here are my attempts so far:
numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) and numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) both return a list of 0s
numX <- colSums(df1[rownames(df1)>10 %in% x,]) returns a list of the sum of count values meeting the conditions for each column
numX <- length(df1[rownames(df1)>10 %in% novel,]) returns the number of times the condition is met (in this example 2L)
I am not really sure how to approach this so I have just been throwing around attempts. I've tried looking for answers but maybe I am just struggling to find the proper wording.
We may do this with rowwise
library(dplyr)
df2 %>%
rowwise %>%
mutate(x = +(sum(df1[[rownames]][df1$rownames %in% x]) >= 5),
y = +(sum(df1[[rownames]][df1$rownames %in% y]) >= 5)) %>%
ungroup
-output
# A tibble: 3 × 5
rownames batch totalcount x y
<chr> <chr> <int> <int> <int>
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
Or based on the data, a base R option would be
out <- aggregate(. ~ grp, FUN = sum,
transform(df1, grp = c('x', 'y')[1 + (rownames %in% y)] )[-1])
df2[out$grp] <- +(t(out[-1]) >= 5)
-output
> df2
rownames batch totalcount x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
data
df1 <- structure(list(rownames = c("m1", "m2", "m3", "m4"), sample1 = c(0L,
1L, 6L, 3L), sample2 = c(5L, 7L, 2L, 1L), sample3 = c(1L, 5L,
0L, 0L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(rownames = c("sample1", "sample2", "sample3"),
batch = c("a", "b", "a"), totalcount = c(10L, 15L, 6L)),
class = "data.frame", row.names = c(NA,
-3L))
How about using using dplyr and reshape2::melt
df3 <- df1 %>%
melt %>%
filter(value >= 5) %>%
mutate(x = as.numeric(rownames %in% c("m1", "m3")),
y = as.numeric(rownames %in% c("m2", "m4"))) %>%
select(-rownames, - value) %>%
group_by(variable) %>%
summarise(x = sum(x), y = sum(y))
df2 %>% left_join(df3, by = c("rownames" = "variable"))
rownames batch total_count x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
You can create a named list of vectors and for each rownames count how many values of x and y in the respective sample is >= 5.
Base R option -
list_vec <- list(x = x, y = y)
cbind(df2, do.call(rbind, lapply(df2$rownames, function(x)
sapply(list_vec, function(y) {
sum(df1[[x]][df1$rownames %in% y] >= 5)
}))))
# rownames batch total.count x y
#1 sample1 a 10 1 0
#2 sample2 b 15 1 1
#3 sample3 a 6 0 1
Using tidyverse -
library(dplyr)
library(purrr)
list_vec <- lst(x, y)
df2 %>%
bind_cols(map_df(df2$rownames, function(x)
map(list_vec, ~sum(df1[[x]][df1$rownames %in% .x] >= 5))))
I have two different IDs for the same subject(patient).
In this other vector of IDs, the two IDs are both in there that indicate the same patient. How do I only count the patient once(by ID1), instead of two different patients with different IDs?
ID1 ID2
11 12
13 14
15 16
vector
11,12,13,13,14,16
I want to count only the unique patients by ID1, such that I would get
x=11,13,15
Thank you!
I think probably you need this
df %>% filter((ID1 %in% vector) | (ID2 %in% vector)) %>%
select(ID1)
ID1
1 11
2 13
3 15
Check it on a better sample
df <- structure(list(ID1 = c(11L, 13L, 15L, 17L, 19L, 21L), ID2 = c(12L,
14L, 16L, 18L, 20L, 22L)), class = "data.frame", row.names = c(NA,
-6L)
> df
ID1 ID2
1 11 12
2 13 14
3 15 16
4 17 18
5 19 20
6 21 22
vector <- c(11, 12, 13, 13, 14, 16, 18, 18)
> df %>% filter((ID1 %in% vector) | (ID2 %in% vector)) %>% select(ID1)
ID1
1 11
2 13
3 15
4 17
By slightly modifying Ronak's code, you can get same results
df %>%
mutate(ID = row_number()) %>%
tidyr::pivot_longer(cols = c(ID1, ID2)) %>%
inner_join(tibble::enframe(vector), by = 'value') %>%
distinct(ID, .keep_all = T) %>%
select(ID, value) %>%
inner_join(df %>% mutate(ID = row_number()), by = 'ID') %>%
select(ID1)
Create a unique ID number for each patient, get the data in long format so both the ID's are in same column, join it with the vector select vector values for distinct ID values.
library(dplyr)
df %>%
mutate(ID = row_number()) %>%
tidyr::pivot_longer(cols = c(ID1, ID2)) %>%
inner_join(tibble::enframe(vector), by = 'value') %>%
distinct(ID, .keep_all = TRUE) %>%
select(value)
# value
# <dbl>
#1 11
#2 13
#3 16
data
df <- structure(list(ID1 = c(11L, 13L, 15L), ID2 = c(12L, 14L, 16L)),
class = "data.frame", row.names = c(NA, -3L))
vector <- c(11, 12, 13, 13, 14, 16)
You can use any with %in% by selecting the rows with apply to subset ID1.
ID$ID1[apply(ID, 1, function(z) any(v %in% z))]
#[1] 11 13 15
or use rowSums.
ID$ID1[rowSums(sapply(ID, "%in%", v)) > 0]
#[1] 11 13 15
Data:
ID <- read.table(header=TRUE, text="ID1 ID2
11 12
13 14
15 16")
v <- c(11,12,13,13,14,16)
I am new to R. I am trying to search the columns using grep multiple times within an apply loop. I use grep to specify which rows are summed based on the vector individuals
individuals <-c("ID1","ID2".....n)
bcdata_total <- sapply(individuals, function(x) {
apply(bcdata_clean[,grep(individuals, colnames(bcdata_clean))], 1, sum)
})
bcdata is of random size and contains random data but contains columns that have individuals in part of the string
>head(bcdata)
ID1-4 ID1-3 ID2-5
A 3 2 1
B 2 2 3
C 4 5 5
grep(individuals[1],colnames(bcdata_clean)) returns a vector that looks like
[1] 1 2, a list of the column names containing ID1. That vector is used to select columns to be summed in bcdata_clean. This should occur n number of times depending on the length of individuals
However this returns the error
In grep(individuals, colnames(bcdata)) :
argument 'pattern' has length > 1 and only the first element will be used
And results in all the columns of bcdata being identical
Ideally individuals would increment each time the function is run like this for each iteration
apply(bcdata_clean[,grep(individuals[1,2....n], colnames(bcdata_clean))], 1, sum)
and would result in something like this
>head(bcdata_total)
ID1 ID2
A 5 1
B 4 3
C 9 5
But I'm not sure how to increment individuals. What is the best way to do this within the function?
You can use split.default to split data on similarly named columns and sum them row-wise.
sapply(split.default(df, sub('-.*', '', names(df))), rowSums, na.rm. = TRUE)
# ID1 ID2
#A 5 1
#B 4 3
#C 9 5
data
df <- structure(list(`ID1-4` = c(3L, 2L, 4L), `ID1-3` = c(2L, 2L, 5L
), `ID2-5` = c(1L, 3L, 5L)), class = "data.frame", row.names = c("A", "B", "C"))
Passing individuals as my argument in function(x) fixed my issue
bcdata_total <- sapply(individuals, function(individuals) {
apply(bcdata_clean[,grep(individuals, colnames(bcdata_clean))], 1, sum)
})
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
df %>%
rownames_to_column('rn') %>%
pivot_longer(cols = -rn, names_to = c(".value", "grp"), names_sep="-") %>%
group_by(rn) %>%
summarise(across(starts_with('ID'), sum, na.rm = TRUE), .groups = 'drop') %>%
column_to_rownames('rn')
# ID1 ID2
#A 5 1
#B 4 3
#C 9 5
data
df <- df <- structure(list(`ID1-4` = c(3L, 2L, 4L), `ID1-3` = c(2L, 2L, 5L
), `ID2-5` = c(1L, 3L, 5L)), class = "data.frame", row.names = c("A", "B", "C"))
I am trying to iterate through columns, and if the column is a whole year, it should be duplicated four times, and renamed to quarters
So this
2000 Q1-01 Q2-01 Q3-01
1 2 3 3
Should become this:
Q1-00 Q2-00 Q3-00 Q4-00 Q1-01 Q2-01 Q3-01
1 1 1 1 2 3 3
Any ideas?
We can use stringr::str_detect to look for colnames with 4 digits then take the last two digits from those columns
library(dplyr)
library(tidyr)
library(stringr)
df %>% gather(key,value) %>% group_by(key) %>%
mutate(key_new = ifelse(str_detect(key,'\\d{4}'),paste0('Q',1:4,'-',str_extract(key,'\\d{2}$'),collapse = ','),key)) %>%
ungroup() %>% select(-key) %>%
separate_rows(key_new,sep = ',') %>% spread(key_new,value)
PS: I hope you don't have a large dataset
Since you want repeated columns, you can just re-index your data frame and then update the column names
df <- structure(list(`2000` = 1L, Q1.01 = 2L, Q2.01 = 3L, Q3.01 = 3L,
`2002` = 1L, Q1.03 = 2L, Q2.03 = 3L, Q3.03 = 3L), row.names = c(NA,
-1L), class = "data.frame")
#> df
#2000 Q1.01 Q2.01 Q3.01 2002 Q1.03 Q2.03 Q3.03
#1 1 2 3 3 1 2 3 3
# Get indices of columns that consist of 4 numbers
col.ids <- grep('^[0-9]{4}$', names(df))
# For each of those, create new names, and for the rest preserve the old names
new.names <- lapply(seq_along(df), function(i) {
if (i %in% col.ids)
return(paste(substr(names(df)[i], 3, 4), c('Q1', 'Q2', 'Q3', 'Q4'), sep = '.'))
return(names(df)[i])
})
# Now repeat each of those columns 4 times
df <- df[rep(seq_along(df), ifelse(seq_along(df) %in% col.ids, 4, 1))]
# ...and finally set the column names to the desired new names
names(df) <- unlist(new.names)
#> df
#00.Q1 00.Q2 00.Q3 00.Q4 Q1.01 Q2.01 Q3.01 02.Q1 02.Q2 02.Q3 02.Q4 Q1.03 Q2.03 Q3.03
#1 1 1 1 1 2 3 3 1 1 1 1 2 3 3
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new dataframe where there are simultaneously X, Y, Z, T Values which are major than a specific threshold.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective, I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# subset data.frame
dat[Reduce("&", Map(">", dat[names(vals)], vals)), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loop through each column in dat and each value in vals one by one with a defined function. ~ifelse(.x > .y | is.na(.y), .x, NA) means if the number in each column is larger than the corresponding value in vals, or vals is NA, the output should be the original value from the column. Otherwise, the value is replaced to be NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in some rows indicating that the condition is not met. Finally, na.omit removes those rows.
Update
Here I demonstrate how to covert the dfTwo dataframe to the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frame.
dfTwo3 <- dfTwo2 %>% left_join(threshold_dt, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now it is the third step. Notice that there is a column called J which does not need any threshold. So we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding Col in dat but not in dfTwo3 and NA to the Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is arrange the right order of dfTwo4. We can achieve this by turning Col to factor and assign the level based on the order of the column name in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The aboved are the breakdown of steps. We can combine all these steps into the following code for simlicity.
library(dplyr)
library(tidyr)
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group)) %>%
left_join(threshold_df, by = "Group") %>%
complete(Col = colnames(dat)) %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
dfOne[Reduce(intersect, list(which(dfOne["X"] > 2),
which(dfOne["Y"] > 2),
which(dfOne["Z"] > 4),
which(dfOne["T"] > 4))),]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals = c(X = 2, Y = 2, Z = 4, T = 4) # from #lmo's answer
dfOne[Reduce(intersect, lapply(names(vals), function(x) which(dfOne[x] > vals[x]))),]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
dfNew = dfOne[dfOne$X > 2 & dfOne$Y > 2 & dfOne$Z > 4 & dfOne$T > 4, ]
Or, using dplyr:
library(dplyr)
dfNew = dfOne %>% filter(X > 2 & Y > 2 & Z > 4 & T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.