I have example data like the following.
site <- c("a", "b")
RankA <- c("3","1")
RankB <- c("1","3")
RankC <- c("0","1")
rawdata <- cbind(site, RankA, RankB, RankC)
I would like to transform it into the form of "newdata" below.
site <- c("a","a","a","a", "b","b","b","b","b")
Rank <- c("A","A","A","B","A","B","B","B","C")
newdata<- cbind(site,Rank)
Thanks,
#edit
rawdata is the result of an evaluation survey about the site. For each site, the number of evaluations at each rank (A to C) is recorded. For example,"site a" has 3 votes for RankA, 1 vote for RankB, and 0 votes for RankC. I want to convert this data into "newdata" where each evaluation is one row.
Try this, using dplyr and tidyr:
site <- c("a", "b")
RankA <- c("3","1")
RankB <- c("1","3")
RankC <- c("0","1")
# Wide table: one row per site, one vote-count column per rank.
df <- data.frame(site, A = RankA, B = RankB, C = RankC)
# Long table: one row per (site, rank) pair with its count in `rep`.
df <- tidyr::pivot_longer(df, cols = 2:4, values_to = "rep", names_to = "rank")
# The counts were entered as character strings; convert them explicitly
# rather than relying on implicit coercion inside rep().
df$rep <- as.integer(df$rep)
# Repeat each row `rep` times; rows with a zero count disappear.
df <- df[rep(seq_len(nrow(df)), df$rep), ]
# Drop the helper count column. The call is namespaced so that dplyr does
# not have to be attached for this snippet to run.
df <- dplyr::select(df, -rep)
df
You can use uncount from tidyr -
library(dplyr)
library(tidyr)
# One row per site, one numeric vote-count column per rank.
rawdata <- data.frame(
  site = c("a", "b"),
  RankA = c(3, 1),
  RankB = c(1, 3),
  RankC = c(0, 1)
)
# Stack the rank columns into name/value pairs, then repeat every row
# `value` times so that each vote becomes its own row.
uncount(pivot_longer(rawdata, cols = -site), value)
# site name
# <chr> <chr>
#1 a RankA
#2 a RankA
#3 a RankA
#4 a RankB
#5 b RankA
#6 b RankB
#7 b RankB
#8 b RankB
#9 b RankC
Related
I have two dataframes, one of which contains a subset of IDs and columns of the other (but has different values).
ds1 <- data.frame(id = c(1:4),
d1 = "A",
d2 = "B",
d3 = "C")
ds2 <- data.frame(id = c(1,2),
d1 = "W",
d2 = "X")
I am hoping to use dplyr on ds1 to find the shared columns and replace their values with those found in ds2, matching on ID. I can mutate them one at a time like this:
ds1 %>%
mutate(d1 = ifelse(id %in% ds2$id, ds2$d1[ds2$id==id],d1),
d2 = ifelse(id %in% ds2$id, ds2$d2[ds2$id==id],d2))
In my real situation, however, I need to do this 47 times. With the robustness of across(), I feel there is a better way. I am open to non-dplyr solutions as well.
You may perhaps need this using dplyr and stringr (can be done without stringr also)
library(tidyverse)
# Join ds2 onto ds1; columns present in both tables come back with the
# suffixes ".x" (from ds1) and ".y" (from ds2).
ds1 %>%
  left_join(ds2, by = "id") %>%
  # Prefer the ds2 value and fall back to the ds1 value where ds2 has none.
  # The suffix pattern is escaped and anchored: an unescaped "." is a regex
  # wildcard, so '.y' could rewrite the wrong part of a name like "type.y".
  mutate(across(
    ends_with(".y"),
    ~ coalesce(., get(str_replace(cur_column(), "\\.y$", ".x")))
  )) %>%
  # Drop the ds1 copies and strip the suffix from the merged columns.
  select(!ends_with(".x")) %>%
  rename_with(~ str_remove(., "\\.y$"), ends_with(".y"))
#> id d3 d1 d2
#> 1 1 C W X
#> 2 2 C W X
#> 3 3 C A B
#> 4 4 C A B
Created on 2021-05-10 by the reprex package (v2.0.0)
using rows_update
library(tidyverse)
ds1 <- data.frame(id = c(1:4),
d1 = "A",
d2 = "B",
d3 = "C")
ds2 <- data.frame(id = c(1,2),
d1 = "W",
d2 = "X")
rows_update(x = ds1, y = ds2, by = "id")
#> id d1 d2 d3
#> 1 1 W X C
#> 2 2 W X C
#> 3 3 A B C
#> 4 4 A B C
Created on 2021-05-11 by the reprex package (v2.0.0)
This is somewhat similar to the one posted by my dear friend #AnilGoyal. Although it is a little more verbose than yours, you can use it for larger data sets:
library(dplyr)
library(stringr)
# Join ds2 onto ds1; shared columns come back as "<name>.x" (ds1) and
# "<name>.y" (ds2).
ds1 %>%
  left_join(ds2, by = "id") %>%
  # For every ds1 column, take the ds2 counterpart when it is not NA and keep
  # the ds1 value otherwise. coalesce() expresses this directly and looks the
  # counterpart up only once. The suffix pattern is escaped and anchored,
  # because an unescaped "." matches any character (e.g. the "ax" in "max.x").
  mutate(across(
    ends_with(".x"),
    ~ coalesce(get(str_replace(cur_column(), "\\.x$", ".y")), .x)
  )) %>%
  # Drop the ds2 copies and strip the suffix from the updated columns.
  select(!ends_with(".y")) %>%
  rename_with(~ str_remove(., "\\.x$"), ends_with(".x"))
id d1 d2 d3
1 1 W X C
2 2 W X C
3 3 A B C
4 4 A B C
I have a dataset of the form:
Var1 Freq
A 16
B 15
C 11
D 11
E 2
F 1
My goal is to get an OUTPUT of the following form:
cat1 cat2 cat3 cat4 cat5
A B C,D E F
16 15 11 2 1
where cat1, ..., cat5 are the names of the variables. I appreciate your help in advance!
# Collapse the labels that share a frequency, then lay the groups out as
# columns named cat1, cat2, ... in alphabetical order of the labels.
with(
  aggregate(Var1 ~ Freq, df, paste, collapse = ","),
  setNames(
    # drop = FALSE keeps a data frame even when there is only one group.
    rbind.data.frame(Var1, Freq)[, order(Var1), drop = FALSE],
    # seq_along() counts the groups; seq(Freq) would expand a lone
    # frequency n into 1:n and produce the wrong number of names.
    paste0("cat", seq_along(Freq))
  )
)
cat1 cat2 cat3 cat4 cat5
1 A B C,D E F
2 16 15 11 2 1
Try this out
library(tidyverse)
df <- tribble(
  ~Var1, ~Freq,
  "A", 16,
  "B", 15,
  "C", 11,
  "D", 11,
  "E", 2,
  "F", 1
) %>%
  # Merge the labels that share a frequency into one comma-separated label.
  group_by(Freq) %>%
  summarise(Var1 = paste(Var1, collapse = ",")) %>%
  arrange(Var1) %>%
  # Transpose via a matrix so that every group becomes a column.
  as.matrix() %>%
  t() %>%
  as_tibble(.name_repair = "universal") %>%
  # Trim whitespace padding from the values (the matrix step turns the
  # numbers into strings). across() replaces the superseded mutate_all().
  mutate(across(everything(), str_trim)) %>%
  # Put the label row above the frequency row.
  arrange(desc(...1))
# seq_along() stays correct even if the table ends up with no columns.
colnames(df) <- paste0("cat", seq_along(df))
# considering your data is in a data.frame called df
# let's create it
var1 <- LETTERS[1:6]
Freq <- c(16, 15, 11, 11, 2, 1)
df <- data.frame(var1, Freq, stringsAsFactors = FALSE)
# Build the label for one frequency value: every var1 entry of the global
# data frame `df` whose Freq equals `x`, glued together with ', '.
# Returns "" when no row matches.
join <- function(x) {
  matching_rows <- which(df$Freq == x)
  labels <- df$var1[matching_rows]
  paste(labels, collapse = ', ')
}
# get unique Freq and its length
unique_freq <- unique(df$Freq)
l <- length(unique_freq)
# create summarised var1: one joined label per unique frequency.
# vapply() always returns a character vector, also for zero frequencies,
# where a `for (i in 1:l)` loop would wrongly iterate over c(1, 0).
summarised_var <- vapply(unique_freq, join, character(1), USE.NAMES = FALSE)
# create grouped data.frame
grouped_df <- data.frame(summarised_var, unique_freq, stringsAsFactors = FALSE)
# transpose to get rows into columns (t() returns a character matrix here)
transposed_df <- t(grouped_df)
# create columns names (variables names); seq_len() is safe for zero rows
col_names <- paste0('cat', seq_len(nrow(grouped_df)))
# rename columns
colnames(transposed_df) <- col_names
# transposed_df is your output
I have a table df that looks like this:
a <- c(10,20, 20, 20, 30)
b <- c("u", "u", "u", "r", "r")
c <- c("a", "a", "b", "b", "b")
df <- data.frame(a,b,c)
I would like to create a new table that contains the mean of col a, grouped by variable c. And I would like to have a column with the counts of the occurrence of b types within each group c.
I would therefore like the result table to look like df2:
a_m <- c(15, 23.3)
c <- c("a", "b")
counts_b <-c("2 u", "1 u, 2 r")
df2 <- data.frame(a_m, c, counts_b)
What I have so far is:
df2 <- df %>% group_by(c) %>% summarise(a_m = mean(a, na.rm = TRUE))
I do not know how to add the column counts_b in the example df2.
Giulia
Here's a way using a little table magic:
# Per group of `c`: the mean of `a`, plus a "<level> <count>" listing of `b`.
df %>%
  group_by(c) %>%
  summarise(
    a_mean = mean(a),
    b_list = {
      # Tabulate once and reuse the result for both labels and counts.
      b_counts <- table(b)
      paste(names(b_counts), b_counts, collapse = ', ')
    }
  )
# A tibble: 2 x 3
c a_mean b_list
<fct> <dbl> <chr>
1 a 15.0 r 0, u 2
2 b 23.3 r 2, u 1
Here is another solution using reshape2. The output format may be more convenient to work with, each value of b has its own column with the number of occurrences.
out1 <- dcast(df, c ~ b, value.var="c", fun.aggregate=length)
c r u
1 a 0 2
2 b 2 1
out2 <- df %>% group_by(c) %>% summarise(a_m = mean(a))
# A tibble: 2 x 2
c a_m
<fctr> <dbl>
1 a 15.00000
2 b 23.33333
# Join on the grouping column. The name must be quoted: a bare `c` refers to
# the base function (or whatever object is currently named `c`), not the column.
df2 <- merge(out1, out2, by = "c")
c r u a_m
1 a 0 2 15.00000
2 b 2 1 23.33333
# Data1
SampleID <- c("A-01","B-01","C-01")
Value <- c(1,2,3)
data1 <- data.frame(SampleID, Value)
# Data2
SampleID <- c("A","B","C")
Value1 <- c(3,4,5)
data2 <- data.frame(SampleID,Value1)
# Output : What I want is the following using:
merge(data1, data2, by=c("SampleID"), all = TRUE)
SampleID Value Value1
A-01 1 3
B-01 2 4
C-01 3 5
You can first split SampleID from data1 and then concatenate it.
# Split each ID at its first "-" into the sample prefix and the number.
# Splitting on the separator (instead of fixed character positions) also
# works for prefixes longer than one character.
SampleID <- c("A-01", "B-01", "C-01")
Sample <- sub("-.*$", "", SampleID)
Num <- sub("^[^-]*-", "", SampleID)
Value <- c(1, 2, 3)
data1 <- data.frame(Sample, Num, Value)
SampleID <- c("A", "B", "C")
Value1 <- c(3, 4, 5)
data2 <- data.frame(SampleID, Value1)
# Full outer join on the sample prefix.
merged <- merge(data1, data2, by.x = "Sample", by.y = "SampleID", all = TRUE)
# Rebuild the original ID and keep only the requested columns, selected by
# name so the result does not depend on column positions.
merged$SampleID <- paste(merged$Sample, merged$Num, sep = "-")
merged <- merged[, c("SampleID", "Value", "Value1")]
SampleID Value Value1
1 A-01 1 3
2 B-01 2 4
3 C-01 3 5
I believe the following does what you need.
# Join key: data1's SampleID with every non-letter character removed.
data1[["NewID"]] <- gsub("[^[:alpha:]]", "", data1[["SampleID"]])
# Full outer join of the two tables on that key.
result <- merge(
  x = data1,
  y = data2,
  by.x = "NewID",
  by.y = "SampleID",
  all = TRUE
)
# merge() places the key column first; remove it from the result.
result <- result[, -1, drop = FALSE]
result
# SampleID Value Value1
#1 A-01 1 3
#2 B-01 2 4
#3 C-01 3 5
You can then remove the extra column from data1 with
data1 <- data1[-3]
You can do it using sqldf library:
library(sqldf)
# Match every data1 ID that starts with a data2 ID followed by "-".
# Column names are spelled as in data1/data2, the pattern is built from
# data2.SampleID, and strings are concatenated with `||` (SQLite syntax;
# NOTE(review): assumes sqldf's default SQLite backend is in use).
sqldf("
  SELECT data1.SampleID, data1.Value, data2.Value1
  FROM data1
  JOIN data2 ON data1.SampleID LIKE data2.SampleID || '-%'
")
Or use data.table, as follows:
library(data.table)
dt1 <- data.table(data1)
dt2 <- data.table(data2)
# `on` can only match columns, not arbitrary expressions such as grepl(), so
# derive an explicit join key first: the part of SampleID before the "-".
dt1[, NewID := sub("-.*$", "", SampleID)]
# Full outer join via merge() (the `[` join has no `all` argument), then
# drop the helper key and print the result.
merge(dt1, dt2, by.x = "NewID", by.y = "SampleID", all = TRUE)[, NewID := NULL][]
To add to collection, here is a dplyr solution which reads a bit easier:
library(dplyr)
# Keep strings as character vectors (needed on R < 4.0). Spelled FALSE
# because `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
SampleID <- c("A-01", "B-01", "C-01")
Value <- c(1, 2, 3)
data1 <- data.frame(SampleID, Value)
SampleID <- c("A", "B", "C")
Value1 <- c(3, 4, 5)
data2 <- data.frame(SampleID, Value1)
data1 %>%
  # Letters-only version of the ID, used solely as the join key.
  mutate(new_id = gsub("[^[:alpha:]]", "", SampleID)) %>%
  left_join(data2, by = c("new_id" = "SampleID")) %>%
  select(-new_id)
SampleID Value Value1
1 A-01 1 3
2 B-01 2 4
3 C-01 3 5
I would like to concatenate an arbitrary number of columns in a dataframe based on a variable cols_to_concat
df <- dplyr::data_frame(a = letters[1:3], b = letters[4:6], c = letters[7:9])
cols_to_concat = c("a", "b", "c")
To achieve the desired result with this specific value of cols_to_concat I could do this:
df %>%
dplyr::mutate(concat = paste0(a, b, c))
But I need to generalise this, using syntax a bit like this
# (DOES NOT WORK)
df %>%
dplyr::mutate(concat = paste0(cols))
I'd like to use the new NSE approach of dplyr 0.7.0, if this is appropriate, but can't figure out the correct syntax.
You can perform this operation using only the tidyverse if you'd like to stick to those packages and principles. You can do it by using either mutate() or unite_(), which comes from the tidyr package.
Using mutate()
library(dplyr)
df <- tibble(a = letters[1:3], b = letters[4:6], c = letters[7:9])
cols_to_concat <- c("a", "b", "c")
df %>% mutate(new_col = do.call(paste0, .[cols_to_concat]))
# A tibble: 3 × 4
a b c new_col
<chr> <chr> <chr> <chr>
1 a d g adg
2 b e h beh
3 c f i cfi
Using unite_()
library(tidyr)
# unite_() is deprecated; unite() accepts the same character vector of
# column names when it is wrapped in all_of(). remove = FALSE keeps the
# source columns next to the new one.
df %>% unite("new_col", all_of(cols_to_concat), sep = "", remove = FALSE)
# A tibble: 3 × 4
new_col a b c
* <chr> <chr> <chr> <chr>
1 adg a d g
2 beh b e h
3 cfi c f i
EDITED July 2020
As of dplyr 1.0.0, it appears that across() and c_across() are replacing the underscore verbs (e.g. unite_) and scoped variants like mutate_if(), mutate_at() and mutate_all(). Below is an example using that convention. Not the most concise, but still an option that promises to be more extensible.
Using c_across()
library(dplyr)
df <- tibble(a = letters[1:3], b = letters[4:6], c = letters[7:9])
cols_to_concat <- c("a", "b", "c")
df %>%
rowwise() %>%
mutate(new_col = paste0(c_across(all_of(cols_to_concat)), collapse=""))
#> # A tibble: 3 x 4
#> # Rowwise:
#> a b c new_col
#> <chr> <chr> <chr> <chr>
#> 1 a d g adg
#> 2 b e h beh
#> 3 c f i cfi
Created on 2020-07-08 by the reprex package (v0.3.0)
You can try syms from rlang:
library(dplyr)
packageVersion('dplyr')
#[1] ‘0.7.0’
# tibble() replaces the deprecated data_frame().
df <- dplyr::tibble(a = letters[1:3], b = letters[4:6], c = letters[7:9])
cols_to_concat <- c("a", "b", "c")
library(rlang)
# Turn the column names into symbols and splice them into paste0() as
# separate arguments.
cols_quo <- syms(cols_to_concat)
df %>% mutate(concat = paste0(!!!cols_quo))
# or
df %>% mutate(concat = paste0(!!!syms(cols_to_concat)))
# # A tibble: 3 x 4
# a b c concat
# <chr> <chr> <chr> <chr>
# 1 a d g adg
# 2 b e h beh
# 3 c f i cfi
You can do the following:
library(dplyr)
# tibble() replaces the deprecated data_frame().
df <- dplyr::tibble(a = letters[1:3], b = letters[4:6], c = letters[7:9])
# Column names as symbols, ready to be spliced into a call.
cols_to_concat <- lapply(list("a", "b", "c"), as.name)
# Capture a single expression, paste0(a, b, c). It is one quosure, so it is
# built with quo() and injected with `!!`; a named `!!!` splice
# (`concat = !!! q`) is rejected by rlang.
q <- quo(paste0(!!!cols_to_concat))
df %>%
  dplyr::mutate(concat = !!q)