Conditional replacing of a numeric value in dplyr - r

Dear all, I have a data frame that looks like this:
df <- data.frame(time=c(1,2,3,4,1,2,3,4,5), type=c("A","A","A","A","B","B","B","B","B"), count=c(10,0,0,1,8,0,1,0,1))
df
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 1
5 1 B 8
6 2 B 0
7 3 B 1
8 4 B 0
9 5 B 1
I want to examine each type group and, once a count of 0 appears, replace every subsequent count in time with 0. I do not want the count to be resurrected after it hits zero.
I want my data to look like this:
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 0
5 1 B 8
6 2 B 0
7 3 B 0
8 4 B 0
9 5 B 0

If I understood correctly
library(tidyverse)
df <-
  data.frame(
    time = c(1, 2, 3, 4, 1, 2, 3, 4, 5),
    type = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
    count = c(10, 0, 0, 1, 8, 0, 1, 0, 1)
  )
df %>%
  group_by(type) %>%
  mutate(count = if_else(lag(count, default = first(count)) == 0, 0, count))
#> # A tibble: 9 x 3
#> # Groups: type [2]
#> time type count
#> <dbl> <chr> <dbl>
#> 1 1 A 10
#> 2 2 A 0
#> 3 3 A 0
#> 4 4 A 0
#> 5 1 B 8
#> 6 2 B 0
#> 7 3 B 0
#> 8 4 B 0
#> 9 5 B 0
Created on 2021-09-10 by the reprex package (v2.0.1)

You may use the cummin function; since count never goes negative, the cumulative minimum stays at 0 from the first 0 onward.
library(dplyr)
df %>% group_by(type) %>% mutate(count = cummin(count))
# time type count
# <dbl> <chr> <dbl>
#1 1 A 10
#2 2 A 0
#3 3 A 0
#4 4 A 0
#5 1 B 8
#6 2 B 0
#7 3 B 0
#8 4 B 0
#9 5 B 0
Since cummin is a base R function, the same idea works entirely in base R -
transform(df, count = ave(count, type, FUN = cummin))
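One caveat worth noting about the lag() answer above: it only zeroes the value immediately following a 0, so for a run like 10, 0, 1, 1 the trailing 1 would survive. A sketch of my own (not part of the original answers) that propagates the zero all the way forward using dplyr's cumany:
library(dplyr)
df %>%
  group_by(type) %>%
  # cumany(count == 0) is TRUE from the first 0 onward; multiplying by its
  # negation zeroes every later count in the group
  mutate(count = count * !cumany(count == 0)) %>%
  ungroup()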

Related

Group repeated values with random breaks in R

I have the following data frame called df (dput below):
group value
1 A 0
2 A 24
3 A 0
4 A 24
5 A 0
6 A 0
7 B 0
8 B 24
9 B 0
10 B 0
11 B 24
12 B 0
I would like to group the repeated values per group when the order is 0->24. Sometimes there is a random 0 with no 24 after. The desired output should look like this:
group value subgroup
1 A 0 1
2 A 24 1
3 A 0 2
4 A 24 2
5 A 0 3
6 A 0 4
7 B 0 1
8 B 24 1
9 B 0 2
10 B 0 3
11 B 24 3
12 B 0 4
As you can see, rows 5 and 9 have no 24 after them, which is why they are grouped alone. So I was wondering if anyone knows how to group repeated values with some random breaks in R?
dput df:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), value = c(0, 24, 0, 24, 0, 0, 0, 24, 0,
0, 24, 0)), class = "data.frame", row.names = c(NA, -12L))
Looks like the subgroup increments whenever there is a 0 value:
df %>%
  group_by(group) %>%
  mutate(subgroup = cumsum(value == 0)) %>%
  ungroup()
# # A tibble: 12 × 3
# group value subgroup
# <chr> <dbl> <int>
# 1 A 0 1
# 2 A 24 1
# 3 A 0 2
# 4 A 24 2
# 5 A 0 3
# 6 A 0 4
# 7 B 0 1
# 8 B 24 1
# 9 B 0 2
# 10 B 0 3
# 11 B 24 3
# 12 B 0 4
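For completeness, the same counter in base R, a hedged sketch of my own rather than part of the original answer:
# ave() applies the cumulative zero-counter within each group
df$subgroup <- ave(df$value, df$group, FUN = function(x) cumsum(x == 0))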

Combine csv files with redundant columns R

I have a series of .csv files that look like this :
a.csv contains
id, a, b, c
1, 10, 0, 0
2, 3, 0 , 0
3, 20, 0, 0
b.csv contains
id, a, b, c
1, 0, 7, 0
2, 0, 9, 0
3, 0, 14, 0
c.csv contains
id, a, b, c
1, 0, 0, 12
2, 0, 0, 8
3, 0, 0, 22
I'm trying to figure out the most efficient way to read them in and create a dataframe that looks like this
id, a, b, c
1, 10, 7, 12
2, 3, 9, 8
3, 20, 14, 22
What would be the best way to do this if there are many more files with many more columns and rows? tidyverse is preferred.
How about this. If all redundant columns have zeros, then you can go long, filter out the zeros, bind the rows, and then go wide.
library(tidyverse)
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22")
list(df_a, df_b, df_c) |>
  map(\(d) pivot_longer(d, cols = -id) |>
        filter(value > 0)) |>
  bind_rows() |>
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or better yet, read in the data marking 0 as NA and then coalesce the data frames.
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0", na = "0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0", na = "0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22", na = "0")
coalesce(df_a, df_b, df_c)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or, if the data is already read in with plain zeros, you can recode 0 as NA after the fact and then coalesce:
list(df_a, df_b, df_c) |>
  map(\(d) mutate(d, across(everything(), \(x) ifelse(x == 0, NA, x)))) |>
  reduce(coalesce)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
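The question mentions many more files. A hedged sketch of my own (the file pattern and the na = "0" trick are assumptions carried over from above) that reads every matching csv and coalesces them in one pass:
library(tidyverse)
list.files(pattern = "\\.csv$") |>
  map(\(f) read_csv(f, na = "0", show_col_types = FALSE)) |>
  reduce(coalesce)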
A base R solution, given the symmetry of the files.
Read the files
file_names <- list.files(pattern="^[abc]\\.csv")
lis <- sapply(file_names, function(x) list(read.csv(x, header=T)))
lis
$a.csv
id a b c
1 1 10 0 0
2 2 3 0 0
3 3 20 0 0
$b.csv
id a b c
1 1 0 7 0
2 2 0 9 0
3 3 0 14 0
$c.csv
id a b c
1 1 0 0 12
2 2 0 0 8
3 3 0 0 22
Combine the columns
column_names <- c("a","b","c")
cbind( lis[["a.csv"]]["id"], sapply(lis, function(x) rowSums(x[column_names])) )
id a.csv b.csv c.csv
1 1 10 7 12
2 2 3 9 8
3 3 20 14 22
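Note the combined columns inherit the file names; a small follow-up of my own (assuming the list stays in a/b/c order) to restore the original headers:
res <- cbind(lis[["a.csv"]]["id"],
             sapply(lis, function(x) rowSums(x[column_names])))
names(res) <- c("id", column_names)  # columns are now id, a, b, c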

How can I subtract values within one column based on values in multiple other columns?

I have a dataframe like this:
dat <- data.frame(c = c(rep(0, 3), rep(5, 3), rep(10, 3)),
                  id = c(rep(c("A", "B", "C"), 3)),
                  measurement = c(1:8, 1))
dat
# c id measurement
# 1 0 A 1
# 2 0 B 2
# 3 0 C 3
# 4 5 A 4
# 5 5 B 5
# 6 5 C 6
# 7 10 A 7
# 8 10 B 8
# 9 10 C 1
I want to subtract the values in the column "measurement" where c is 0 from all other values in this column. This should happen separately based on the info given in the column "id". E.g. the value where c is 0 and "id" is A should be subtracted from all values where c is > 0 and "id" is A. The value where c is 0 and "id" is B should be subtracted from all values where c is > 0 and "id" is B and so on.
If the difference would be negative, the result should be 0.
The result should look like this:
result <- data.frame(c = c(rep(0, 3), rep(5, 3), rep(10, 3)),
                     id = c(rep(c("A", "B", "C"), 3)),
                     measurement = c(1:8, 1),
                     difference = c(0, 0, 0, 3, 3, 3, 6, 6, 0))
result
# c id measurement difference
# 1 0 A 1 0
# 2 0 B 2 0
# 3 0 C 3 0
# 4 5 A 4 3
# 5 5 B 5 3
# 6 5 C 6 3
# 7 10 A 7 6
# 8 10 B 8 6
# 9 10 C 1 0
I used dplyr to select the values of "measurement" based on the info from the other columns, but unfortunately I don't know how to do the calculations. So any suggestions are welcome!
For each id you can subtract the measurement value where c == 0 from all the other measurement values. Using pmax we replace negative results with 0.
library(dplyr)
dat %>%
  group_by(id) %>%
  mutate(difference = pmax(measurement - measurement[c == 0], 0))
# c id measurement difference
# <dbl> <chr> <dbl> <dbl>
#1 0 A 1 0
#2 0 B 2 0
#3 0 C 3 0
#4 5 A 4 3
#5 5 B 5 3
#6 5 C 6 3
#7 10 A 7 6
#8 10 B 8 6
#9 10 C 1 0
Try this. You can join the data against a filtered copy of itself (the rows where c == 0). After that, dplyr verbs get you to the expected output:
library(dplyr)
#Code
new <- dat %>%
  left_join(
    dat %>% filter(c == 0) %>% select(-c) %>% rename(Var = measurement)
  ) %>%
  mutate(measurement = measurement - Var) %>%
  replace(. <= 0, 0) %>%
  select(-Var)
Output:
c id measurement
1 0 A 0
2 0 B 0
3 0 C 0
4 5 A 3
5 5 B 3
6 5 C 3
7 10 A 6
8 10 B 6
9 10 C 0
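A base R take of my own, a sketch that assumes exactly one c == 0 row per id (as in the example):
# look up, for each row, the measurement of its id's c == 0 row
base_val <- dat$measurement[dat$c == 0][match(dat$id, dat$id[dat$c == 0])]
dat$difference <- pmax(dat$measurement - base_val, 0)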

How can I distribute a vector of numbers by a vector of percentages, round the result, and always get the same total that I started with in R?

Question Summary
I want to multiply a vector of numbers (the Sum_By_Group column) by a vector of percentages (the Percent column) to distribute the total number for the group into each ID, round the result, and end up with the same total number that I started with. In other words, I want the Distribution_Post_Round column to be the same as the Sum_By_Group column.
Below is an example of the issue that I am running into. In Group A, I multiply Percent by Sum_By_Group and finish with 3 in ID 1, 3 in ID 2, and 1 in ID 5 for a total of 7. The Sum_By_Group column and Distribution_Post_Round column are the same for Group A and this is what I want. In Group B, I multiply Percent by Sum_By_Group and finish with 1 in ID 8 and 1 in ID 10 for a total of 2. I want the Distribution_Post_Round column to be 3 for Group B.
Is there a way to do this without using loops, subsetting data frames, and then rejoining the data frames together?
Example
library(dplyr)
df = data.frame('Group' = c(rep('A', 7), rep('B', 5)),
                'ID' = c(1:12),
                'Percent' = c(0.413797750, 0.385366840, 0.014417571, 0.060095668, 0.076399650,
                              0.019672573, 0.030249949, 0.381214519, 0.084121796, 0.438327886,
                              0.010665749, 0.085670050),
                'Sum_By_Group' = c(rep(7, 7), rep(3, 5)))
df$Distribute_By_ID = round(df$Percent * df$Sum_By_Group, 0)
df_round = aggregate(Distribute_By_ID ~ Group, data = df, sum)
names(df_round)[names(df_round) == 'Distribute_By_ID'] = 'Distribution_Post_Round'
df = left_join(df, df_round, by = 'Group')
df
Group ID Percent Sum_By_Group Distribute_By_ID Distribution_Post_Round
A 1 0.41379775 7 3 7
A 2 0.38536684 7 3 7
A 3 0.01441757 7 0 7
A 4 0.06009567 7 0 7
A 5 0.07639965 7 1 7
A 6 0.01967257 7 0 7
A 7 0.03024995 7 0 7
B 8 0.38121452 3 1 2
B 9 0.08412180 3 0 2
B 10 0.43832789 3 1 2
B 11 0.01066575 3 0 2
B 12 0.08567005 3 0 2
Thank you so much for your help. Please let me know if additional clarification is needed.
Wow, who knew someone had already written a package that includes a function to solve this... kudos to that team https://cran.r-project.org/web/packages/sfsmisc/index.html
Since you seem willing to use dplyr, hopefully this additional package is worth it, because it makes for an elegant solution.
# https://stackoverflow.com/questions/61667720
library(dplyr)
df = data.frame('Group' = c(rep('A', 7), rep('B', 5)),
                'ID' = c(1:12),
                'Percent' = c(0.413797750, 0.385366840, 0.014417571, 0.060095668, 0.076399650,
                              0.019672573, 0.030249949, 0.381214519, 0.084121796, 0.438327886,
                              0.010665749, 0.085670050),
                'Sum_By_Group' = c(rep(7, 7), rep(3, 5)))
glimpse(df)
#> Rows: 12
#> Columns: 4
#> $ Group <chr> "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "…
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
#> $ Percent <dbl> 0.41379775, 0.38536684, 0.01441757, 0.06009567, 0.076399…
#> $ Sum_By_Group <dbl> 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3
df %>%
  group_by(Group) %>%
  mutate(Distribute_By_ID = sfsmisc::roundfixS(Percent * Sum_By_Group))
#> # A tibble: 12 x 5
#> # Groups: Group [2]
#> Group ID Percent Sum_By_Group Distribute_By_ID
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 A 1 0.414 7 3
#> 2 A 2 0.385 7 3
#> 3 A 3 0.0144 7 0
#> 4 A 4 0.0601 7 0
#> 5 A 5 0.0764 7 1
#> 6 A 6 0.0197 7 0
#> 7 A 7 0.0302 7 0
#> 8 B 8 0.381 3 1
#> 9 B 9 0.0841 3 0
#> 10 B 10 0.438 3 2
#> 11 B 11 0.0107 3 0
#> 12 B 12 0.0857 3 0
Created on 2020-05-07 by the reprex package (v0.3.0)
df %>%
  mutate(dividend = floor(Percent * Sum_By_Group),
         remainder = Percent * Sum_By_Group - dividend) %>%
  group_by(Group) %>%
  arrange(desc(remainder), .by_group = TRUE) %>%
  mutate(delivered = sum(dividend),
         rownumber = 1:n(),
         lastdelivery = if_else(rownumber <= Sum_By_Group - delivered, 1, 0),
         Final = dividend + lastdelivery) %>%
  ungroup()
# A tibble: 12 x 10
Group ID Percent Sum_By_Group dividend remainder delivered rownumber lastdelivery Final
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 A 1 0.414 7 2 0.897 4 1 1 3
2 A 2 0.385 7 2 0.698 4 2 1 3
3 A 5 0.0764 7 0 0.535 4 3 1 1
4 A 4 0.0601 7 0 0.421 4 4 0 0
5 A 7 0.0302 7 0 0.212 4 5 0 0
6 A 6 0.0197 7 0 0.138 4 6 0 0
7 A 3 0.0144 7 0 0.101 4 7 0 0
8 B 10 0.438 3 1 0.315 2 1 1 2
9 B 12 0.0857 3 0 0.257 2 2 0 0
10 B 9 0.0841 3 0 0.252 2 3 0 0
11 B 8 0.381 3 1 0.144 2 4 0 1
12 B 11 0.0107 3 0 0.0320 2 5 0 0
This is my solution, without any other dependencies, relying on the Hare quota (largest remainder method):
I distributed all the integer "seats" first, then handed out the remaining "seats" in decreasing order of the remainders.
The column "Final" then holds the result.
Note: it seems to give the same results as the other solution that uses a package.
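For readers who want that logic as a reusable function, a minimal base R sketch of my own (round_preserve_sum is a hypothetical name; it assumes total is a whole number):
# Hare quota / largest remainder: hand out the floor() shares first, then
# give the leftover units to the largest fractional remainders.
round_preserve_sum <- function(p, total) {
  raw <- p * total
  out <- floor(raw)
  leftover <- total - sum(out)
  extra <- order(raw - out, decreasing = TRUE)[seq_len(leftover)]
  out[extra] <- out[extra] + 1
  out
}
round_preserve_sum(df$Percent[1:7], 7)
# [1] 3 3 0 0 1 0 0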
Formulating this as an integer optimization problem:
library(CVXR)
A <- as.data.frame.matrix(t(model.matrix(~0+Group, df)))
prop <- df$Percent * df$Sum_By_Group
x <- Variable(nrow(df), integer=TRUE)
sums <- df$Sum_By_Group[!duplicated(df$Group)]
p <- Problem(Minimize(sum_squares(x - prop)), list(A %*% x == sums))
result <- solve(p)
df$Distribute_By_ID <- as.integer(round(result$getValue(x)))
output:
Group ID Percent Sum_By_Group
1 A 1 0.41379775 7
2 A 2 0.38536684 7
3 A 3 0.01441757 7
4 A 4 0.06009567 7
5 A 5 0.07639965 7
6 A 6 0.01967257 7
7 A 7 0.03024995 7
8 B 8 0.38121452 3
9 B 9 0.08412180 3
10 B 10 0.43832789 3
11 B 11 0.01066575 3
12 B 12 0.08567005 3

R: new column of row difference from max value of another column according to group

The title of the question may be unclear, but I hope the code below demonstrates my problem.
I have a data frame with three columns: $sensor (A and B), $hour_day (hour of the day, 0-4), and $value (the temperature reading, 1-5).
new.df <- data.frame(
  sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
  hour_day = c(0:4, 0:4),
  value = c(1, 1, 3, 1, 2, 1, 3, 4, 5, 2)
)
new.df
sensor hour_day value
1 A 0 1
2 A 1 1
3 A 2 3
4 A 3 1
5 A 4 2
6 B 0 1
7 B 1 3
8 B 2 4
9 B 3 5
10 B 4 2
I want to make a new column that indicates the difference in hour from the hour with maximum value according to the sensor.
Desired result
sensor value hour_day hour_from_max_hour
1 A 1 0 -2
2 A 1 1 -1
3 A 3 2 0
4 A 1 3 1
5 A 2 4 2
6 B 1 0 -3
7 B 3 1 -2
8 B 4 2 -1
9 B 5 3 0
10 B 2 4 1
Note that the maximum is at hour 2 for sensor A and at hour 3 for sensor B. I just want a new column that tells me how many hours each row is from the hour with the maximum value for that sensor.
Thank you in advance and please let me know if I can provide more information.
EDIT
The previous answers were very helpful, but I forgot that there is one more variable (day) in this problem. Also, sometimes there is more than one maximum in a group. When this is the case, I would like to base the difference on the first maximum.
df_add <- data.frame(
  sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
             "A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
  hour_day = c(0:4, 0:4, 0:4, 0:4),
  value = c(1, 1, 3, 3, 2,
            3, 2, 4, 4, 1,
            1, 5, 6, 6, 2,
            2, 1, 3, 3, 1),
  day = c(1, 1, 1, 1, 1,
          1, 1, 1, 1, 1,
          2, 2, 2, 2, 2,
          2, 2, 2, 2, 2)
)
df_add
sensor hour_day value day
1 A 0 1 1
2 A 1 1 1
3 A 2 3 1
4 A 3 3 1
5 A 4 2 1
6 B 0 3 1
7 B 1 2 1
8 B 2 4 1
9 B 3 4 1
10 B 4 1 1
11 A 0 1 2
12 A 1 5 2
13 A 2 6 2
14 A 3 6 2
15 A 4 2 2
16 B 0 2 2
17 B 1 1 2
18 B 2 3 2
19 B 3 3 2
20 B 4 1 2
A simple pipe can do it. All you have to do is look up the hour of max(value) inside the mutate instruction; which(...)[1] picks the first maximum in case of ties.
new.df %>%
  group_by(sensor) %>%
  mutate(hour_from_max_hour = hour_day - hour_day[which(value == max(value))[1]])
## A tibble: 10 x 4
## Groups: sensor [2]
# sensor hour_day value hour_from_max_hour
# <fct> <int> <dbl> <int>
# 1 A 0 1. -2
# 2 A 1 1. -1
# 3 A 2 3. 0
# 4 A 3 1. 1
# 5 A 4 2. 2
# 6 B 0 1. -3
# 7 B 1 3. -2
# 8 B 2 4. -1
# 9 B 3 5. 0
#10 B 4 2. 1
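For the edited data with the extra day variable, the same idea should extend by grouping on both columns, a sketch under that assumption:
df_add %>%
  group_by(sensor, day) %>%
  mutate(hour_from_max_hour = hour_day - hour_day[which(value == max(value))[1]]) %>%
  ungroup()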
library(dplyr)
new.df.2 <-
  # First get the hours with the max values
  new.df %>%
  group_by(sensor) %>%
  filter(value == max(value)) %>%
  ungroup() %>%
  select(sensor, max_hour = hour_day) %>% # this renames hour_day as max_hour
  # Now join that to the original table and make the calculation
  right_join(new.df) %>%
  mutate(hour_from_max_hour = hour_day - max_hour)
Result:
new.df.2
# A tibble: 10 x 5
sensor max_hour hour_day value hour_from_max_hour
<fct> <int> <int> <dbl> <int>
1 A 2 0 1 -2
2 A 2 1 1 -1
3 A 2 2 3 0
4 A 2 3 1 1
5 A 2 4 2 2
6 B 3 0 1 -3
7 B 3 1 3 -2
8 B 3 2 4 -1
9 B 3 3 5 0
10 B 3 4 2 1
This is probably how I would do it:
library(plyr)
dd = ddply(new.df, .(sensor), summarize,
           max.value = max(value),
           hour.of.max = hour_day[which.max(value)])
new.df = merge(new.df, dd, all.x=T, by='sensor')
new.df$hour_from_max_hour = new.df$hour_day - new.df$hour.of.max
Gave you a couple extra columns, but you can delete them:
sensor hour_day value max.value hour.of.max hour_from_max_hour
1 A 0 1 3 2 -2
2 A 1 1 3 2 -1
3 A 2 3 3 2 0
4 A 3 1 3 2 1
5 A 4 2 3 2 2
6 B 0 1 5 3 -3
7 B 1 3 5 3 -2
8 B 2 4 5 3 -1
9 B 3 5 5 3 0
10 B 4 2 5 3 1
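And a dependency-free base R version, my own hedged sketch (which.max again keeps the first maximum):
# row index of the first maximum within each sensor
max_idx <- tapply(seq_len(nrow(new.df)), new.df$sensor,
                  function(i) i[which.max(new.df$value[i])])
new.df$hour_from_max_hour <-
  new.df$hour_day - new.df$hour_day[max_idx[as.character(new.df$sensor)]]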
