Adding a pvalue column to dataframe in R - r

I have a dataframe that looks like this:
A1 A2 A3 B1 B2 B3
0 1 0 2 3 3
5 6 4 4 6 6
I would like to add a column based on t-testing the significance of the difference between As and Bs:
A1 A2 A3 B1 B2 B3 PValue
0 1 0 3 3 4 <some small number>
5 6 4 4 6 6 <some large number>
I tried using dplyr like this:
# Attempt from the question. mutate() hands t.test() the whole A1..B3
# columns at once, so one test is run on all rows pooled together and its
# single p-value is recycled down the new column.
data %>%
  mutate(
    PValue = t.test(unlist(c(A1, A2, A3)), unlist(c(B1, B2, B3)))$p.value
  )
However, the resulting PValue column is constant for some reason. I would appreciate any help.

If we are doing this by row, then pmap is one way
library(tidyverse)
# Row-wise t-test: pmap_dbl() passes the six values of one row through `...`;
# the first three are the A sample and the last three the B sample.
data %>%
  pmap_dbl(function(...) {
    row_vals <- c(...)
    t.test(row_vals[1:3], row_vals[4:6])$p.value
  }) %>%
  bind_cols(data, PValue = .)
# A1 A2 A3 B1 B2 B3 PValue
#1 0 1 0 2 3 3 0.007762603
#2 5 6 4 4 6 6 0.725030185
or another option is rowwise with do
# Row-wise t-test: under rowwise(), c_across() gathers the selected columns
# of the current row into one vector. This replaces the superseded do().
data %>%
  rowwise() %>%
  mutate(PValue = t.test(c_across(A1:A3), c_across(B1:B3))$p.value) %>%
  ungroup()
# A tibble: 2 x 7
# A1 A2 A3 B1 B2 B3 PValue
#* <int> <int> <int> <int> <int> <int> <dbl>
#1 0 1 0 2 3 3 0.00776
#2 5 6 4 4 6 6 0.725
Or we can gather to 'long' format and then do the group by t.test
# Reshape to long format, test within each original row, then bind the
# p-values back on. The row id is an integer (not a character row name) so
# that the grouped summary comes back in the original row order; character
# ids would sort as "1", "10", "2", ... and misalign the results.
data %>%
  mutate(rn = row_number()) %>%
  pivot_longer(-rn, names_to = "key", values_to = "val") %>%
  group_by(rn) %>%
  summarise(PValue = t.test(val[str_detect(key, "A")],
                            val[str_detect(key, "B")])$p.value) %>%
  pull(PValue) %>%
  bind_cols(data, PValue = .)
data
# Example input: two rows, each with three A and three B measurements.
data <- data.frame(
  A1 = c(0L, 5L), A2 = c(1L, 6L), A3 = c(0L, 4L),
  B1 = c(2L, 4L), B2 = c(3L, 6L), B3 = c(3L, 6L)
)

Also with apply in Base R:
# Base R: run the t-test once per row (MARGIN = 1); positions 1-3 hold the A
# columns and positions 4-6 the B columns.
data[["PValue"]] <- apply(
  data,
  MARGIN = 1,
  FUN = function(row) t.test(row[1:3], row[4:6])$p.value
)
or:
library(dplyr)
# Same per-row test inside a pipeline; `.` is the piped-in data frame.
data %>%
  mutate(
    PValue = apply(., MARGIN = 1, FUN = function(row) {
      t.test(row[1:3], row[4:6])$p.value
    })
  )
Result:
A1 A2 A3 B1 B2 B3 PValue
1 0 1 0 2 3 3 0.007762603
2 5 6 4 4 6 6 0.725030185

Related

Melt a dataframe by different column names

I have a data frame of the following form:
# Input: header rows (X1 == "stn") alternate with data rows; the literal
# string "NA" marks unused cells.
a <- data.frame(
  X1 = c("stn", "s1", "stn", "s2"),
  X2 = c("var1", "1", "var4", "2"),
  X3 = c("var2", "2", "var3", "3"),
  X4 = c("NA", "NA", "var2", "2")
)
X1 X2 X3 X4
1 stn var1 var2 NA
2 s1 1 2 NA
3 stn var4 var3 var2
4 s2 2 3 2
How can I get the result:
# Desired result: one row per station, one column per variable.
b <- data.frame(
  stn = c("s1", "s2"),
  var1 = c(1, NA),
  var2 = c(2, 2),
  var3 = c(NA, 3),
  var4 = c(NA, 2)
)
stn var1 var2 var3 var4
1 s1 1 2 NA NA
2 s2 NA 2 3 2
A (mostly) base R solution: split the data.frame at each header row, call janitor::row_to_names(1) on each piece, recombine the pieces, and remove the unwanted column using subset
# Start a new chunk at every "stn" header row, promote each chunk's first row
# to column names, stack the chunks, then drop the placeholder "NA" column.
# `a` is the data frame defined in the question.
do.call(
  dplyr::bind_rows,
  a |>
    split(cumsum(a$X1 == "stn")) |>
    lapply(\(chunk) janitor::row_to_names(chunk, 1))
) |>
  subset(select = -`NA`)
stn var1 var2 var4 var3
1 s1 1 2 <NA> <NA>
2 s2 <NA> 2 2 3
A dplyr solution:
library(dplyr)
a %>%
  # A new chunk starts at every "stn" header row, so chunks may differ in
  # number and length (no fixed two-row stride is assumed).
  group_by(grp = cumsum(X1 == "stn")) %>%
  # Promote each chunk's first row to column names; keep the remaining rows.
  group_map(~ setNames(.x[-1, ], as.character(unlist(.x[1, ])))) %>%
  bind_rows() %>%
  # "NA" is the placeholder header of unused cells.
  select(-`NA`) %>%
  # Every column except the station id holds numbers stored as text.
  mutate(across(-stn, as.numeric))
# # A tibble: 2 × 5
# stn var1 var2 var4 var3
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 1 2 NA NA
# 2 s2 NA 2 2 3

Copy all pairwise combinations of dataframe and their combined name in a new dataframe

I was wondering how to generate a new dataframe containing all pairwise combinations of the value and the combined row and column name of another dataframe. To explain as an example I have the following dataframe:
# 5 x 5 data frame whose column names and row names are both a1..a5
df <- data.frame(
  a1 = c(4, 2, 6, 9, 13),
  a2 = c(56, 1, 47, 2, 3),
  a3 = c(4, 6, 9, 11, 85),
  a4 = c(6, 15, 4, 12, 3),
  a5 = c(54, 94, 3, 2, 75),
  row.names = c("a1", "a2", "a3", "a4", "a5")
)
df now looks like this:
a1
a2
a3
a4
a5
a1
4
56
4
6
54
a2
2
1
6
15
94
a3
6
47
9
4
3
a4
9
2
11
12
2
a5
13
3
85
3
75
I need a new dataframe of all possible combinations (so 25x2) looking like this:
Step
Value
1
a1a1
4
2
a1a2
56
3
a1a3
4
4
a1a4
6
...
...
...
25
a5a5
75
Thank you.
You could convert the data to a table and back to a data.frame.
# A table built from the matrix keeps both sets of dimnames, so turning it
# back into a data frame gives one row per (row name, column name, value).
df2 <- df |>
  as.matrix() |>
  as.table() |>
  as.data.frame()
# Show the pairs ordered by row name.
df2[order(df2[["Var1"]]), ]
# Var1 Var2 Freq
# 1 a1 a1 4
# 6 a1 a2 56
# 11 a1 a3 4
# 16 a1 a4 6
# 21 a1 a5 54
# 2 a2 a1 2
# 7 a2 a2 1
# 12 a2 a3 6
# 17 a2 a4 15
# 22 a2 a5 94
# ...
You can put it in a long format:
library(tidyr)
library(dplyr)
df %>%
  # keep the row names as an ordinary column before reshaping
  mutate(col1 = rownames(.)) %>%
  # one row per (row name, column name) pair
  pivot_longer(cols = -col1, names_to = "col2", values_to = "Value") %>%
  # build the combined label and keep only the label and the value
  transmute(step = paste0(col1, col2), Value) %>%
  # order by the combined label
  arrange(step)
# A tibble: 25 x 2
step Value
<chr> <dbl>
1 a1a1 4
2 a1a2 56
3 a1a3 4
4 a1a4 6
5 a1a5 54
6 a2a1 2
7 a2a2 1
8 a2a3 6
9 a2a4 15
10 a2a5 94
# ... with 15 more rows
We may use stack.
stack(dat)
# values ind
# 1 4 a1
# 2 2 a1
# 3 6 a1
# 4 9 a1
# 5 13 a1
# 6 56 a2
# 7 ...
Or, to be precise:
# Transposing first makes stack() walk the original data row by row: `ind`
# is the original row name and, within each row, the values run across the
# original columns. The recycled second label must therefore come from the
# column names (these only coincide with the row names in this example).
cbind(stack(as.data.frame(t(dat))), r = colnames(dat)) |>
  transform(step = paste0(ind, r)) |>
  subset(select = c(4, 1))
# step values
# 1 a1a1 4
# 2 a1a2 56
# 3 a1a3 4
# 4 a1a4 6
# 5 a1a5 54
# 6 a2a1 2
# 7 ...
Data:
# 5 x 5 integer example data; row names mirror the column names.
dat <- data.frame(
  a1 = c(4L, 2L, 6L, 9L, 13L),
  a2 = c(56L, 1L, 47L, 2L, 3L),
  a3 = c(4L, 6L, 9L, 11L, 85L),
  a4 = c(6L, 15L, 4L, 12L, 3L),
  a5 = c(54L, 94L, 3L, 2L, 75L),
  row.names = c("a1", "a2", "a3", "a4", "a5")
)

How to check if pairs remain the same between years?

I have a data frame with pairs of individual birds (male and female) that were observed in several years. I am trying to figure out whether these pairs have changed from one year to the next so that I can do some further analyses.
My data is structured like this:
# One row per observed pair and year (three pairs in each of three years).
dat <- tibble(
  year = rep(1:3, each = 3),
  Male = c("A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)
# A tibble: 9 x 3
year Male Female
<int> <chr> <chr>
1 1 A1 X1
2 1 B1 Y1
3 1 C1 Z1
4 2 A1 X1
5 2 B1 Y2
6 2 C1 Z2
7 3 A1 X1
8 3 B1 Y2
9 3 C2 Z2
And my expected output is something like:
# A tibble: 9 x 5
year Male Female male_state female_state
<int> <chr> <chr> <chr> <chr>
1 1 A1 X1 new new
2 1 B1 Y1 new new
3 1 C1 Z1 new new
4 2 A1 X1 reunited reunited
5 2 B1 Y2 divorced new
6 2 C1 Z2 divorced new
7 3 A1 X1 reunited reunited
8 3 B1 Y2 reunited reunited
9 3 C2 Z2 new divorced
I cannot figure out how to check whether a value from a different column is the same in the year before (e.g. if the male ID is the same for a certain female in year 2 or 3 as in the year prior). Any ideas?
This (probably overcomplicated) pipe produces the following output.
# One row per observed pair and year.
dat <- tibble(
  year = rep(1:3, each = 3),
  Male = c("A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)

dat %>%
  mutate(pair = paste0(Male, Female)) %>%
  # put each pair's observations next to each other in year order
  arrange(pair, year) %>%
  mutate(
    # Same pair as the previous row and a later year -> established couple.
    # The first row has no predecessor, so its comparison is NA and it is
    # labelled as a new couple.
    check = if_else(
      pair == lag(pair) & year > lag(year),
      "old couple",
      "new couple",
      missing = "new couple"
    ),
    # Same male as the previous row but a different female.
    divorced = if_else(
      Male == lag(Male) & Female != lag(Female),
      "divorce",
      "",
      missing = ""
    )
  )
OUTPUT:
# A tibble: 9 × 6
year Male Female pair check divorced
<int> <chr> <chr> <chr> <chr> <chr>
1 1 A1 X1 A1X1 new couple ""
2 2 A1 X1 A1X1 old couple ""
3 3 A1 X1 A1X1 old couple ""
4 1 B1 Y1 B1Y1 new couple ""
5 2 B1 Y2 B1Y2 new couple "divorce"
6 3 B1 Y2 B1Y2 old couple ""
7 1 C1 Z1 C1Z1 new couple ""
8 2 C1 Z2 C1Z2 new couple "divorce"
9 3 C2 Z2 C2Z2 new couple ""
Try this:
library(tidyverse)

dat <- tibble(
  year = rep(1:3, each = 3),
  Male = c("A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)

dat |>
  mutate(pairing = str_c(Male, "|", Female)) |>
  add_count(pairing) |>
  # Within one pairing, every row after the first is a repeat of that pairing;
  # the first row compares against a missing lag and therefore stays NA.
  group_by(pairing) |>
  mutate(
    male_state = if_else(pairing == lag(pairing), "reunited", NA_character_),
    female_state = male_state
  ) |>
  # Per male: the first row is "new"; a still-unlabelled later row means the
  # partner changed.
  group_by(Male) |>
  mutate(
    male_state = case_when(
      row_number() == 1 ~ "new",
      is.na(male_state) ~ "divorced",
      TRUE ~ male_state
    )
  ) |>
  # Same rule applied per female.
  group_by(Female) |>
  mutate(
    female_state = case_when(
      row_number() == 1 ~ "new",
      is.na(female_state) ~ "divorced",
      TRUE ~ female_state
    )
  ) |>
  arrange(year, Male)
#> # A tibble: 9 × 7
#> # Groups: Female [5]
#> year Male Female pairing n male_state female_state
#> <int> <chr> <chr> <chr> <int> <chr> <chr>
#> 1 1 A1 X1 A1|X1 3 new new
#> 2 1 B1 Y1 B1|Y1 1 new new
#> 3 1 C1 Z1 C1|Z1 1 new new
#> 4 2 A1 X1 A1|X1 3 reunited reunited
#> 5 2 B1 Y2 B1|Y2 2 divorced new
#> 6 2 C1 Z2 C1|Z2 1 divorced new
#> 7 3 A1 X1 A1|X1 3 reunited reunited
#> 8 3 B1 Y2 B1|Y2 2 reunited reunited
#> 9 3 C2 Z2 C2|Z2 1 new divorced
Created on 2022-05-03 by the reprex package (v2.0.1)

R Count Values of every Column in Dataframe

I can do it using group by with only one Column but I want the Count for every Column in the Dataframe. See example:
Dataframe with similar values in each Column:
df =
C1 C2 C3
1 Positiv Negativ Neutral
2 NA Neutral Positiv
3 Positiv NA Negativ
4 Negativ Positiv NA
I thought about something like this:
df %>% group_by(names(df)) %>% count()
desired Output: Count Values in each Column
C1 C2 C3
1 Positive 2 1 1
2 Negative 1 1 1
3 Neutral 0 1 1
4 NA 1 1 1
Try this with some tidyverse functions and reshaping your data to long and wide:
library(dplyr)
library(tidyr)
#Code
new <- df %>%
  # long format: one row per cell, `name` = source column, `value` = content
  pivot_longer(everything()) %>%
  # count() tallies each (column, value) pair and returns an ungrouped result
  count(name, value, name = "N") %>%
  # back to wide: one column per source column, 0 where a value never occurs
  pivot_wider(names_from = name, values_from = N, values_fill = 0L)
Output:
# A tibble: 4 x 4
value C1 C2 C3
<chr> <int> <int> <int>
1 Negativ 1 1 1
2 Positiv 2 1 1
3 NA 1 1 1
4 Neutral 0 1 1
Some data used:
#Data
df <- data.frame(
  C1 = c("Positiv", NA, "Positiv", "Negativ"),
  C2 = c("Negativ", "Neutral", NA, "Positiv"),
  C3 = c("Neutral", "Positiv", "Negativ", NA),
  row.names = c("1", "2", "3", "4"),
  stringsAsFactors = FALSE
)
We can use base R with table
# For each column, count against a fixed level set so that categories absent
# from a column still show up as 0, and keep a separate row for missing values.
sapply(
  df,
  function(col) {
    table(factor(col, levels = c('Negativ', 'Positiv', 'Neutral')),
          useNA = "always")
  }
)
# C1 C2 C3
#Negativ 1 1 1
#Positiv 2 1 1
#Neutral 0 1 1
#<NA> 1 1 1
Or using a vectorized approach
# Cross-tabulate every cell value against its column index in a single call.
# useNA = 'always' also appends an <NA> level to the column-index dimension;
# that extra column is at position ncol(df) + 1 and is dropped here.
table(unlist(df), c(col(df)), useNA = 'always')[, -(ncol(df) + 1L)]
# 1 2 3
# Negativ 1 1 1
# Neutral 0 1 1
# Positiv 2 1 1
# <NA> 1 1 1
data
# Example input: three sentiment columns, each with one missing value.
df <- data.frame(
  C1 = c("Positiv", NA, "Positiv", "Negativ"),
  C2 = c("Negativ", "Neutral", NA, "Positiv"),
  C3 = c("Neutral", "Positiv", "Negativ", NA),
  row.names = c("1", "2", "3", "4"),
  stringsAsFactors = FALSE
)

Convert column to comma separated in R

I have two columns, A and B, in Excel with a large amount of data. Both columns A and B have to be considered, and I am trying to produce column C as the output. Right now I am doing everything in Excel. I think there may be a way to do this in R, but I really don't know how. Any help is appreciated. Thanks.
I have
Column A ColumnB Column C(output column)
A1 10 A2
A2 10 A1
B1 3 B2,B3,B4
B2 3 B1,B3,B4
B3 3 B1,B2,B4
B4 3 B1,B2,B3
C1 6 C2,C3
C2 6 C1,C3
C3 6 C1,C2
We can group by column B then find a set difference between the current column A character and the whole characters in the group:
library(tidyverse)
df %>%
  group_by(ColumnB) %>%
  mutate(
    # for each entry, list the other ColumnA members of its ColumnB group
    ColumnC = map_chr(ColumnA, function(current) {
      toString(setdiff(ColumnA, current))
    })
  )
# A tibble: 9 x 3
# Groups: ColumnB [3]
ColumnA ColumnB ColumnC
<fct> <int> <chr>
1 A1 10 A2
2 A2 10 A1
3 B1 3 B2, B3, B4
4 B2 3 B1, B3, B4
5 B3 3 B1, B2, B4
6 B4 3 B1, B2, B3
7 C1 6 C2, C3
8 C2 6 C1, C3
9 C3 6 C1, C2
I don't think the question is phrased very clearly but I am interpreting the desired results to be that you want Column C to have all the values from each group of Column B, leaving out the value of Column A. You can do this as follows:
nest Column A and join it back onto the original data frame
flatten it so you now have a vector of the Column A values
use setdiff to get the values that are not Column A
collapse into comma separated string with str_c
You can see that your desired Column C is reproduced.
library(tidyverse)
tbl <- structure(list(ColumnA = c("A1", "A2", "B1", "B2", "B3", "B4", "C1", "C2", "C3"), ColumnB = c(10L, 10L, 3L, 3L, 3L, 3L, 6L, 6L, 6L), ColumnC = c("A2", "A1", "B2,B3,B4", "B1,B3,B4", "B1,B2,B4", "B1,B2,B3", "C2,C3", "C1,C3", "C1,C2")), problems = structure(list(row = 9L, col = "ColumnC", expected = "", actual = "embedded null", file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame")), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(ColumnA = structure(list(), class = c("collector_character", "collector")), ColumnB = structure(list(), class = c("collector_integer", "collector")), ColumnC = structure(list(), class = c("collector_character", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
# Attach to every row the ColumnA values that share its ColumnB, then remove
# the row's own value and collapse the rest into one string.
# left_join() is given no `by`, so it joins on the columns both tables share.
tbl %>%
left_join(
# one row per ColumnB, with that group's ColumnA values nested in a list-column
tbl %>% select(-ColumnC) %>% nest(ColumnA)
) %>%
mutate(
# NOTE(review): flatten() presumably unwraps each nested element down to its
# ColumnA vector; this relies on the nest() semantics of the tidyr version the
# answer was written with -- confirm before running on a current tidyverse
data = flatten(data),
# drop the row's own ColumnA value from its group's values
output = map2(data, ColumnA, ~ setdiff(.x, .y)),
# collapse the remaining values into a single comma-separated string
output = map_chr(output, ~ str_c(., collapse = ","))
)
#> Joining, by = "ColumnB"
#> # A tibble: 9 x 5
#> ColumnA ColumnB ColumnC data output
#> <chr> <int> <chr> <list> <chr>
#> 1 A1 10 A2 <chr [2]> A2
#> 2 A2 10 A1 <chr [2]> A1
#> 3 B1 3 B2,B3,B4 <chr [4]> B2,B3,B4
#> 4 B2 3 B1,B3,B4 <chr [4]> B1,B3,B4
#> 5 B3 3 B1,B2,B4 <chr [4]> B1,B2,B4
#> 6 B4 3 B1,B2,B3 <chr [4]> B1,B2,B3
#> 7 C1 6 C2,C3 <chr [3]> C2,C3
#> 8 C2 6 C1,C3 <chr [3]> C1,C3
#> 9 C3 6 C1,C2 <chr [3]> C1,C2
Created on 2018-08-21 by the reprex package (v0.2.0).
My understanding is to find all OTHER entries of column A that share the current value of column B
Grouping by B, and finding all A's associated with the value should do the trick (some clean-up afterward removes the current entry of A from the resulting column C)
library(tidyverse)

a <- c("a1", "a2", "b1", "b2", "b3", "b4", "c1", "c2", "c3", "d1")
b <- c(10, 10, 3, 3, 3, 3, 6, 6, 6, 5)
dta <- data.frame(a, b, stringsAsFactors = FALSE)
dta <- dta %>%
  group_by(b) %>%
  # every row first receives the full comma-separated member list of its group
  mutate(c = paste0(a, collapse = ",")) %>%
  ungroup() %>%
  # Remove the row's own id from that list: ",id" covers a non-leading
  # position, "id," a leading one. fixed() matches the id literally instead
  # of interpreting it as a regular expression.
  # NOTE: this is substring matching, so an id that is a prefix of another id
  # in the same group (e.g. "a1" and "a10") would be removed incorrectly.
  mutate(c = str_replace(c, pattern = fixed(paste0(",", a)), replacement = "")) %>%
  mutate(c = str_replace(c, pattern = fixed(paste0(a, ",")), replacement = "")) %>%
  # a group with a single member has no other entries
  mutate(c = if_else(c == a, NA_character_, c))
Another version of a tidyverse solution. The separate function is handy for splitting an existing column into new columns. Doing this creates the Group column, which ensures that every operation is carried out within each group. The map2 and map functions are ideal for element-wise operations. dat2 is the final output.
library(tidyverse)
dat2 <- dat %>%
  # the first character of ColumnA becomes Group, the remainder Number
  separate(ColumnA, into = c("Group", "Number"), sep = 1,
           remove = FALSE, convert = TRUE) %>%
  group_by(Group) %>%
  mutate(
    # every row carries the complete ColumnA vector of its group ...
    List = list(ColumnA),
    # ... minus its own entry ...
    List = map2(List, ColumnA, function(members, current) {
      members[!(members %in% current)]
    }),
    # ... collapsed into one comma-separated string
    ColumnC = map_chr(List, function(others) str_c(others, collapse = ","))
  ) %>%
  ungroup() %>%
  select(starts_with("Column"))
dat2
# # A tibble: 9 x 3
# ColumnA ColumnB ColumnC
# <chr> <int> <chr>
# 1 A1 10 A2
# 2 A2 10 A1
# 3 B1 3 B2,B3,B4
# 4 B2 3 B1,B3,B4
# 5 B3 3 B1,B2,B4
# 6 B4 3 B1,B2,B3
# 7 C1 6 C2,C3
# 8 C2 6 C1,C3
# 9 C3 6 C1,C2
DATA
# Example input: nine ids in ColumnA with their shared key in ColumnB.
dat <- data.frame(
  ColumnA = c("A1", "A2", "B1", "B2", "B3", "B4", "C1", "C2", "C3"),
  ColumnB = c(10L, 10L, 3L, 3L, 3L, 3L, 6L, 6L, 6L),
  stringsAsFactors = FALSE
)
# Example input parsed from inline text. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
df <- read.table(text = "
ColumnA ColumnB
A1 10
A2 10
B1 3
B2 3
B3 3
B4 3
C1 6
C2 6
C3 6
", header = TRUE, stringsAsFactors = FALSE)
library(tidyverse)
df %>%
  # work within each ColumnB value
  group_by(ColumnB) %>%
  mutate(
    # give every row the full ColumnA vector of its group
    vals = list(ColumnA),
    # remove the row's own ColumnA value from that vector
    vals = map2(vals, ColumnA, function(group_vals, own) {
      group_vals[group_vals != own]
    }),
    # join what is left into one comma-separated string
    vals = map_chr(vals, function(rest) paste0(rest, collapse = ","))
  ) %>%
  # drop the grouping again
  ungroup()
# # A tibble: 9 x 3
# ColumnA ColumnB vals
# <chr> <int> <chr>
# 1 A1 10 A2
# 2 A2 10 A1
# 3 B1 3 B2,B3,B4
# 4 B2 3 B1,B3,B4
# 5 B3 3 B1,B2,B4
# 6 B4 3 B1,B2,B3
# 7 C1 6 C2,C3
# 8 C2 6 C1,C3
# 9 C3 6 C1,C2

Resources