Convert information from rows to new columns - r

Is there a way in R to place every three values in the column "V" (below) into new columns? In other words, I need to reshape the data from long to wide, but only into three columns, where the values are those that appear in column V. Below is a demonstration.
Thank you in advance!
# Example input in long format: three keys, six values per key.
data <- structure(
  list(
    Key = rep(c(200, 300, 400), each = 6),
    V = c(
      "a", "b", "c", "b", "d", "c",
      "d", "b", "c", "a", "f", "c",
      "d", "b", "c", "a", "b", "c"
    )
  ),
  row.names = c(NA, 18L),
  class = "data.frame"
)

Here is one option
library(dplyr)
library(tidyr)

# Within each Key, number the chunks of three consecutive rows (`grp`) and
# label the position inside each chunk as x, y or z (`col`). Each chunk then
# becomes one output row. Requires the row count per Key to be a multiple of 3.
data %>%
  group_by(Key) %>%
  mutate(
    grp = gl(n() / 3, 3),
    col = c("x", "y", "z")[(row_number() - 1) %% 3 + 1]
  ) %>%
  ungroup() %>%
  # pivot_wider() replaces the superseded spread(); Key and grp identify a row.
  pivot_wider(names_from = col, values_from = V) %>%
  select(-grp)
## A tibble: 6 x 4
# Key x y z
# <dbl> <chr> <chr> <chr>
#1 200 a b c
#2 200 b d c
#3 300 d b c
#4 300 a f c
#5 400 d b c
#6 400 a b c
Note: This assumes that the number of entries per Key is divisible by 3.
Instead of grp = gl(n() / 3, 3) you can also use grp = rep(1:(n() / 3), each = 3).
Update
In response to your comments, let's create sample data by removing some rows from data such that for Key = 200 and Key = 300 we don't have a multiple of 3 V entries.
data2 <- data %>% slice(-c(1, 8))
Then we can do
library(dplyr)
library(tidyr)

# Same reshape, but tolerant of Keys whose row count is not a multiple of 3:
# the last chunk of such a Key is simply shorter and its missing positions
# are filled with NA when widening.
data2 %>%
  group_by(Key) %>%
  # Integer chunk index: rows 1-3 -> 0, rows 4-6 -> 1, ...
  mutate(grp = (row_number() - 1) %/% 3) %>%
  group_by(Key, grp) %>%
  # Position inside the chunk decides the target column.
  mutate(col = c("x", "y", "z")[row_number()]) %>%
  ungroup() %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = col, values_from = V) %>%
  select(-grp)
## A tibble: 6 x 4
# Key x y z
# <dbl> <chr> <chr> <chr>
#1 200 b c b
#2 200 d c NA
#3 300 d c a
#4 300 f c NA
#5 400 d b c
#6 400 a b c
Note how "missing" values are filled with NA.

Related

Compute the difference between two columns by pair in R

I have the following data:
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
I want to "extend" this data frame to make name pairs for every possible combination of names without repetition like so:
names_1 <- c("a", "a", "a", "b", "b", "c")
names_2 <- c("b", "c", "d", "c", "d", "d")
scores_1 <- c(95, 95, 95, 55, 55, 100)
scores_2 <- c(55, 100, 60, 100, 60, 60)
df_extended <- cbind.data.frame(names_1, names_2, scores_1, scores_2)
In the extended data, scores_1 are the scores for the corresponding name in names_1, and scores_2 are for names_2.
The following bit of code makes the appropriate name pairs. But I do not know how to get the scores in the right place after that.
t(combn(df$names,2))
The final goal is to get the row-wise difference between scores_1 and scores_2.
df_extended$score_diff <- abs(df_extended$scores_1 - df_extended$scores_2)
# For every unordered pair of names, collect the pair followed by the two
# matching scores. `%in%` yields the scores in the row order of `df`, which
# lines up with the pair because both follow the order of `df$names`.
df_ext <- type.convert(
  data.frame(t(combn(df$names, 2, FUN = function(pair) {
    c(pair, df$scores[df$names %in% pair])
  }))),
  as.is = TRUE
)
names(df_ext) <- c("name_1", "name_2", "type_1", "type_2")
df_ext
name_1 name_2 type_1 type_2
1 a b 95 55
2 a c 95 100
3 a d 95 60
4 b c 55 100
5 b d 55 60
6 c d 100 60
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)
library(tidyverse)

# For each column of `df`, list all 2-element combinations as a two-column
# tibble, suffix the columns with _1 / _2, and bind the results side by side.
# The stated goal is the absolute row-wise difference, hence abs().
map(df, ~ combn(x = .x, m = 2) %>% t() %>% as_tibble()) %>%
  imap_dfc(~ set_names(x = .x, nm = paste(.y, seq(ncol(.x)), sep = "_"))) %>%
  mutate(score_diff = abs(scores_1 - scores_2))
#> # A tibble: 6 × 5
#> names_1 names_2 scores_1 scores_2 score_diff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a b 95 55 40
#> 2 a c 95 100 5
#> 3 a d 95 60 35
#> 4 b c 55 100 45
#> 5 b d 55 60 5
#> 6 c d 100 60 40
Created on 2022-06-06 by the reprex package (v2.0.1)
First, we can create a new data frame with the unique combinations of names. Then, we can merge on the scores to match the names for both names_1 and names_2 to get the final data.frame.
names <- c("a", "b", "c", "d")
scores <- c(95, 55, 100, 60)
df <- cbind.data.frame(names, scores)

# One row per unordered pair of names.
new_df <- setNames(
  data.frame(t(combn(df$names, 2))),
  c("names_1", "names_2")
)

# Attach the score of each side of the pair. The second merge suffixes the two
# `scores` columns (.x for names_1, .y for names_2), which are renamed below.
new_df <- merge(new_df, df, by.x = "names_1", by.y = "names")
new_df <- merge(new_df, df, by.x = "names_2", by.y = "names")
names(new_df)[3:4] <- c("scores_1", "scores_2")
> new_df
names_2 names_1 scores_1 scores_2
1 b a 95 55
2 c a 95 100
3 c b 55 100
4 d a 95 60
5 d b 55 60
6 d c 100 60

R: Calculate percentage of observations in a column that are below a certain value for panel data

I have panel data and I would like to get the percentage of observations in a column (Size) that are below 1 million.
My data is the following:
structure(list(Product = c("A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B", "C", "C", "C", "C", "C", "C"), Date = c("02.05.2018",
"04.05.2018", "05.05.2018", "06.05.2018", "07.05.2018", "08.05.2018",
"02.05.2018", "04.05.2018", "05.05.2018", "06.05.2018", "07.05.2018",
"08.05.2018", "02.05.2018", "04.05.2018", "05.05.2018", "06.05.2018",
"07.05.2018", "08.05.2018"), Size = c(100023423, 1920, 2434324342,
2342353566, 345345345, 432, 1.35135e+11, 312332, 23434, 4622436246,
3252243, 234525, 57457457, 56848648, 36363546, 36535636, 2345,
2.52646e+11)), class = "data.frame", row.names = c(NA, -18L))
So for instance, for Product A it would be 33.33% since two out of 6 observations are below one million.
I have tried the following in R
# Attempt from the question: per Product, count the rows and the rows whose
# Size is below one million.
# NOTE: count() is handed a logical vector (Size<1000000) here rather than a
# data frame, which is what triggers the "no applicable method for 'count'"
# error quoted in the question.
df <- df %>%
group_by(Product) %>%
dplyr:: summarise(CountDate = n(), SmallSize = count(Size<1000000))
However, I get an error saying "no applicable method for 'count' applied to an object of class "logical"", even though the column Size is of type double.
After the code above I would then calculate SmallSize/CountDate to get the percentage.
What do I need to adjust to not get the error message?
Instead of count, which requires a data.frame/tibble, use sum on a logical vector to get the count - TRUE values will be counted as 1 and FALSE as 0
library(dplyr)

# Per product: total number of rows, number of rows with Size below one
# million (summing a logical vector counts its TRUE values), and their ratio.
df %>%
  group_by(Product) %>%
  dplyr::summarise(
    CountDate = n(),
    SmallSize = sum(Size < 1e6, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::mutate(Percent = SmallSize / CountDate)
# A tibble: 3 × 4
Product CountDate SmallSize Percent
<chr> <int> <int> <dbl>
1 A 6 2 0.333
2 B 6 3 0.5
3 C 6 1 0.167
Also, we don't need to create both the columns. It can be directly calculated with mean
# The mean of a logical vector is the fraction of TRUE values, so the share of
# rows below one million can be computed in a single step per product.
df %>%
  group_by(Product) %>%
  dplyr::summarise(
    Percent = mean(Size < 1e6, na.rm = TRUE)
  )
# A tibble: 3 × 2
Product Percent
<chr> <dbl>
1 A 0.333
2 B 0.5
3 C 0.167

Assign a value to a column in R based on a percentage within each group

[]
I need to create column C in a data frame where 30% of the rows within each group (column B) get a value 0.
How do I do this in R?
We may use rbinom after grouping by the 'category' column. Specify prob as a single success probability of 0.7, so that each row has a 30% chance of receiving 0.
library(dplyr)

# rbinom() recycles `prob` across the draws, so a vector such as c(0.7, 0.3)
# would alternate the probability from row to row. A scalar 0.7 gives every
# row the same 70% chance of 1 (30% chance of 0). The share of zeros per
# category is therefore 30% in expectation, not exactly.
df1 %>%
  group_by(category) %>%
  mutate(value = rbinom(n(), 1, 0.7)) %>%
  ungroup()
-output
# A tibble: 9 x 3
sno category value
<int> <chr> <int>
1 1 A 1
2 2 A 0
3 3 A 1
4 4 B 1
5 5 B 0
6 6 B 1
7 7 C 1
8 8 C 0
9 9 C 0
data
df1 <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B",
"B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))
If your data already exist (assuming this is a simplified answer), and if you want the value to be randomly assigned to each group:
library(dplyr)

d <- data.frame(
  sno = 1:9,
  category = rep(c("A", "B", "C"), each = 3)
)

# Within each category, build a vector holding floor(70%) ones and zeros for
# the remaining rows, then shuffle it so the zeros land on random rows.
d %>%
  group_by(category) %>%
  mutate(
    value = sample(
      rep(c(1, 0), times = c(floor(n() * 0.7), n() - floor(n() * 0.7)))
    )
  ) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()
Base R
# Fix the RNG state so the draws below are reproducible.
set.seed(42)

# ave() applies the function to the rows of each category and writes the
# results back in the original row order. Each row independently gets 0 with
# probability 0.3 and 1 with probability 0.7.
d$value <- ave(
  numeric(nrow(d)),
  d$category,
  FUN = function(rows) {
    sample(c(0L, 1L), size = length(rows), replace = TRUE, prob = c(0.3, 0.7))
  }
)
d
# sno category value
# 1 1 A 0
# 2 2 A 0
# 3 3 A 1
# 4 4 B 0
# 5 5 B 1
# 6 6 B 1
# 7 7 C 0
# 8 8 C 1
# 9 9 C 1
Data copied from Brigadeiro's answer:
d <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA, -9L))

Summation of money amounts in character format by group

I have a data frame that contains the monetary transactions among individuals. The transactions can be two-way, i.e. A can transfer money to B and B can also transfer money to A. The structure of the data frame looks like below:
From To Amount
A B $100
A C $40
A D $30
B A $25
B C $70
C A $190
C D $110
I want to summarize the total amount of transactions among each pair of individuals who have transactions with each other and the results should be something like:
Individual_1 Individual_2 Sum
A B $125
A C $230
A D $30
B C $70
C D $110
I tried to utilize the grouping feature of the package dplyr but I think it does not apply to my case.
You can use pmin/pmax to sort From and To columns and sum the Amount value.
library(dplyr)

# pmin()/pmax() put the two individuals of each row into a fixed order, so
# A -> B and B -> A fall into the same group. parse_number() strips the "$".
# .groups = "drop" returns an ungrouped result instead of one still grouped
# by col1.
df %>%
  group_by(
    col1 = pmin(From, To),
    col2 = pmax(From, To)
  ) %>%
  summarise(
    Amount = sum(readr::parse_number(Amount)),
    .groups = "drop"
  )
# col1 col2 Amount
# <chr> <chr> <dbl>
#1 A B 125
#2 A C 230
#3 A D 30
#4 B C 70
#5 C D 110
Using the same logic in base R you can do :
# Base R version: order each pair with pmin()/pmax(), turn "$100" into 100 by
# removing the first "$", then sum the amounts per ordered pair.
aggregate(
  Amount ~ col1 + col2,
  data = transform(
    df,
    col1 = pmin(From, To),
    col2 = pmax(From, To),
    Amount = as.numeric(sub("\\$", "", Amount))
  ),
  FUN = sum
)
data
df <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA, -7L))
A solution using the tidyverse package. You need to find a way to create a common grouping column with the right order of the individuals. dat2 is the final output.
library(tidyverse)

# Sum the transferred amounts per unordered pair of individuals.
# The pair is ordered with pmin()/pmax() and used directly as two grouping
# columns; pasting the names with "_" and splitting them again would break
# for names that themselves contain "_".
dat2 <- dat %>%
  mutate(Amount = as.numeric(str_remove(Amount, "\\$"))) %>%
  group_by(
    Individual_1 = pmin(From, To),
    Individual_2 = pmax(From, To)
  ) %>%
  summarize(Sum = sum(Amount, na.rm = TRUE), .groups = "drop") %>%
  # Restore the "$" prefix used in the input.
  mutate(Sum = str_c("$", Sum))
print(dat2)
# # A tibble: 5 x 3
# Individual_1 Individual_2 Sum
# <chr> <chr> <chr>
# 1 A B $125
# 2 A C $230
# 3 A D $30
# 4 B C $70
# 5 C D $110
Data
# Example transactions; the first element is the header line and every
# following element is one whitespace-separated record.
dat <- read.table(
  text = c(
    "From To Amount",
    "A B $100",
    "A C $40",
    "A D $30",
    "B A $25",
    "B C $70",
    "C A $190",
    "C D $110"
  ),
  header = TRUE
)
A complete solution without packages, based on @RonakShah's great pmin/pmax approach, using list notation in aggregate (in contrast to formula notation), which allows name assignment.
# Package-free version using the list interface of aggregate(), which lets the
# output columns be named directly.
# The amount is parsed by dropping everything except digits, the decimal point
# and a minus sign, so values such as "$12.50" keep their fractional part.
with(
  transform(
    d,
    a = as.numeric(gsub("[^0-9.-]", "", Amount)),
    b = pmin(From, To),
    c = pmax(From, To)
  ),
  aggregate(
    list(Sum = a),
    list(Individual_1 = b, Individual_2 = c),
    function(x) paste0("$", sum(x))
  )
)
# Individual_1 Individual_2 Sum
# 1 A B $125
# 2 A C $230
# 3 B C $70
# 4 A D $30
# 5 C D $110
Data:
d <- structure(list(From = c("A", "A", "A", "B", "B", "C", "C"), To = c("B",
"C", "D", "A", "C", "A", "D"), Amount = c("$100", "$40", "$30",
"$25", "$70", "$190", "$110")), class = "data.frame", row.names = c(NA,
-7L))

Loop to Replace Matching Values

I'm looking for an easy and elegant way to accomplish this.
So if I have dataset x and relationship is A -> B -> Z -> Y and D -> H -> G, I would like to create dataset y. Unfortunately, they are not necessarily in order:
> x <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("B", "E", "Z", "H", "G", "Y")))
>
> y <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("Y", "E", "Y", "G", "G", "Y")))
>
> x
from to
1 A B
2 E E
3 B Z
4 D H
5 H G
6 Z Y
> y
from to
1 A Y
2 E E
3 B Y
4 D G
5 H G
6 Z Y
I have a fairly large dataset (currently 500k rows; will grow in the future) and actually care about the performance; I'm not sure if there are any other ways to do this without a for-loop or even to vectorize/parallelize the process.
I'm thinking about splitting and removing all rows where from == to or creating an indicator to skip certain rows so the loop does not have to go through the entire dataset each time.
I'd also like to know what the breakpoint should be if I do create a loop; I'm not sure how to define when the loop should stop.
Any suggestions would be appreciated. Thanks!
We can use dplyr to create a grouping variable by comparing the adjacent elements of 'to' and 'from', and then change the values in 'to' to the last element of 'to' within each group.
library(dplyr)
# Collapse chains of from -> to links: every row of a chain gets the `to`
# value of the chain's last row.
# The grouping flag is TRUE on the first row and on every row whose `from`
# differs from the `to` of the row directly above it; cumsum() turns those
# flags into a running group id. Only links between adjacent rows are
# detected, so rows belonging to one chain must already sit next to each
# other, in chain order.
x %>%
group_by(grp = cumsum(lag(lead(from, default = last(from)) !=
as.character(to), default = TRUE))) %>%
# Within a chain, the final `to` is the end point for all of its rows.
mutate(to = last(to)) %>%
ungroup %>%
select(-grp)
# A tibble: 4 x 2
# from to
# <fctr> <fctr>
#1 A D
#2 B D
#3 C D
#4 E E
Another solution can be achieved using lag from dplyr and fill from tidyr as:
library(tidyverse)

# Sort by `from`, then treat a row as a continuation of the row above it only
# when its `from` equals that row's `to`. Every other row (including the
# first) opens a new chain; all rows of a chain receive the chain's last `to`.
x %>%
  arrange(from) %>%
  mutate(
    continues = coalesce(from == lag(to), FALSE),
    group = cumsum(!continues)
  ) %>%
  group_by(group) %>%
  mutate(to = last(to)) %>%
  ungroup() %>%
  select(-continues, -group)
# A tibble: 6 x 2
# from to
# <chr> <chr>
#1 A D
#2 B D
#3 C D
#4 E E
#5 F H
#6 G H
Data used
# Example links: A -> B -> C -> D, F -> G -> H, and the self-link E -> E,
# deliberately not listed in chain order.
x <- data.frame(
  from = c("A", "B", "F", "C", "G", "E"),
  to = c("B", "C", "G", "D", "H", "E"),
  stringsAsFactors = FALSE
)

Resources