Divide column from dataframe into another - r

I've got 2 data frames that I'm trying to divide by each other but it's not working for me. Both dataframes are 8 x 3 with column one the same for both, column names are also the same for both data frames
bal_tier[,c(1, 3:4)]
# A tibble: 8 x 3
# Groups: hierachy_level2 [8]
hierachy_level2 `201804` `201904`
<chr> <dbl> <dbl>
1 CS 239 250
2 FNZ 87 97
3 OPS 1057 1136.
4 P&T 256 279
5 R&A 520 546
6 SPE 130 136.
7 SPP 67 66
8 TUR 46 69
dput(bal_tier[,c(1, 3:4)])
structure(list(hierachy_level2 = c("CS", "FNZ", "OPS", "P&T",
"R&A", "SPE", "SPP", "TUR"), `201804` = c(239, 87, 1057, 256,
520, 130, 67, 46), `201904` = c(250, 97, 1136.5, 279, 546, 136.5,
66, 69)), row.names = c(NA, -8L), groups = structure(list(hierachy_level2 = c("CS",
"FNZ", "OPS", "P&T", "R&A", "SPE", "SPP", "TUR"), .rows = list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
tier_leavers[,c(1, 3:4)]
# A tibble: 8 x 3
# Groups: hierachy_level2 [8]
hierachy_level2 `201804` `201904`
<chr> <dbl> <dbl>
1 CS 32 47
2 FNZ 1 11
3 OPS 73 76
4 P&T 48 33
5 R&A 41 33
6 SPE 28 30
7 SPP 10 12
8 TUR 2 3
dput(tier_leavers[,c(1, 3:4)])
structure(list(hierachy_level2 = c("CS", "FNZ", "OPS", "P&T",
"R&A", "SPE", "SPP", "TUR"), `201804` = c(32, 1, 73, 48, 41,
28, 10, 2), `201904` = c(47, 11, 76, 33, 33, 30, 12, 3)), row.names = c(NA,
-8L), groups = structure(list(hierachy_level2 = c("CS", "FNZ",
"OPS", "P&T", "R&A", "SPE", "SPP", "TUR"), .rows = list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Doing this gives me what I want:
bal_tier[,1]
# A tibble: 8 x 1
# Groups: hierachy_level2 [8]
hierachy_level2
<chr>
1 CS
2 FNZ
3 OPS
4 P&T
5 R&A
6 SPE
7 SPP
8 TUR
(tier_leavers[,c(3:4)] / bal_tier[,c(3:4)])
201804 201904
1 0.13389121 0.18800000
2 0.01149425 0.11340206
3 0.06906339 0.06687198
4 0.18750000 0.11827957
5 0.07884615 0.06043956
6 0.21538462 0.21978022
7 0.14925373 0.18181818
8 0.04347826 0.04347826
but when I combine it in a cbind I end up with this:
cbind(bal_tier[,1], tier_leavers[,c(3:4)] / bal_tier[,c(3:4)])
[,1] [,2]
201804 Character,8 Numeric,8
201904 Character,8 Numeric,8
What am I understanding wrong here?

Here's a solution using tidyverse
nme <- c("A","B","C","D","E")
yr_1 <- round(10*runif(n=5,min=0,max=10),0)
yr_2 <- round(10*runif(n=5,min=0,max=10),0)
data_1 <- data.frame(nme,yr_1,yr_2)
yr_1 <- round(10*runif(n=5,min=0,max=10),0)
yr_2 <- round(10*runif(n=5,min=0,max=10),0)
data_2 <- data.frame(nme,yr_1,yr_2)
data_divide <- data_1 %>%
left_join(data_2,by="nme") %>%
mutate(
result_1=yr_1.x/yr_1.y,
result_2=yr_2.x/yr_2.y
)

What I ended up doing feels like cheating but I got a clue from Zeus's answer:
a <- bal_tier[, 1]
b <- tier_leavers[,c(3:4)] / bal_tier[,c(3:4)]
tier_to <- data.frame(a, b)
tier_to
> tier_to
hierachy_level2 X201804 X201904
1 CS 0.13389121 0.18800000
2 FNZ 0.01149425 0.11340206
3 OPS 0.06906339 0.06687198
4 P&T 0.18750000 0.11827957
5 R&A 0.07884615 0.06043956
6 SPE 0.21538462 0.21978022
7 SPP 0.14925373 0.18181818
8 TUR 0.04347826 0.04347826

Related

How to join two dataframes containing time varying variables in R

This seems like a simple enough thing but I can't figure it out nor find an answer online - apologies if it something obvious. I have two seperate dataframes containing the same patients with the same unique identifier. Both datasets have time varying variables - one continuous and one categorical and the time to each reading is different in the sets but have a common start point at time 1. I have tried to modify the tmerge function from survival package but without luck as I don't have a dichotomous outcome variable nor a single data set with one row per patient.
Reprex for creating the datasets below (df1 and df2) and an example of my desired combined output table for a single patient (ID 3), output gets very long if done for all 4 patients
Thanks for any possible help
df1 <- structure(list(tstart = c(1, 1, 1, 1426, 1, 560, 567), tstop = c(2049,
3426, 1426, 1707, 560, 567, 4207), category = structure(c(1L,
1L, 1L, 2L, 1L, 4L, 2L), .Label = c("none", "high", "low", "moderate"
), class = "factor"), id = c(1L, 2L, 3L, 3L, 4L, 4L, 4L)), row.names = c(NA,
-7L), class = c("tbl_df", "tbl", "data.frame"))
df2 <- structure(list(tstart = c(1, 365, 730, 1, 365, 730, 1096, 2557,
1, 365, 730, 1096, 1826, 2557, 3652, 1), tstop = c(365, 730,
1096, 365, 730, 1096, 2557, 2582, 365, 730, 1096, 1826, 2557,
3652, 4864, 365), egfr = c(66, 62, 58, 54, 50, 43, 49, 51, 106,
103, 80, 92, 97, 90, 81, 51), id = c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df_example_patient_3 <- structure(list(id = c(3L, 3L, 3L,
3L, 3L, 3L,3L, 3L, 3L), tstart = c(1, 365, 730, 1096, 1426, 1707, 1826, 2557, 3652), tstop = c(365, 730,
1096, 1426, 1707, 1826, 2557, 3652, 4864), egfr = c(106, 103, 80, 92, 92, 92, 97, 90, 81), category = c("none", "none", "none", "none", "high", "high", "high", "high", "high")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
# DF1
tstart tstop category id
<dbl> <dbl> <fct> <int>
1 1 2049 none 1
2 1 3426 none 2
3 1 1426 none 3
4 1426 1707 high 3
5 1 560 none 4
6 560 567 moderate 4
7 567 4207 high 4
# DF2
tstart tstop egfr id
<dbl> <dbl> <dbl> <int>
1 1 365 66 1
2 365 730 62 1
3 730 1096 58 1
4 1 365 54 2
5 365 730 50 2
6 730 1096 43 2
7 1096 2557 49 2
8 2557 2582 51 2
9 1 365 106 3
10 365 730 103 3
11 730 1096 80 3
12 1096 1826 92 3
13 1826 2557 97 3
14 2557 3652 90 3
15 3652 4864 81 3
16 1 365 51 4
# Combined set
id tstart tstop egfr category
<int> <dbl> <dbl> <dbl> <chr>
1 3 1 365 106 none
2 3 365 730 103 none
3 3 730 1096 80 none
4 3 1096 1426 92 none
5 3 1426 1707 92 high
6 3 1707 1826 92 high
7 3 1826 2557 97 high
8 3 2557 3652 90 high
9 3 3652 4864 81 high
I had to do it this way to really work out the details.
First, i construct a full df1 with all the timestamps, including those of df2.
then i proceed with multiple merges. This is not elegant, but it works:
library(data.table)
library(zoo)
# Proper data.tables
setDT(df1, key = c("id", "tstart"))
setDT(df2, key = c("id", "tstart"))
timestamps_by_id <- unique(rbind(
df1[, .(id, tstart)],
df1[, .(id, tstop)],
df2[, .(id, tstart)],
df2[, .(id, tstop)],
use.names = F
))
setorder(timestamps_by_id, id, tstart)
# Merge to construct full df1
df1_full <- df1[timestamps_by_id]
df1_full[, category := na.locf(category), by = id]
df1_full[, tstop := shift(tstart, -1), by = id]
setkey(df1_full, id, tstart)
# Merge with df2
result <- na.omit(df2[df1_full, roll = T])
result[, tstop := i.tstop]
print(result[id == 3, .(id, tstart, tstop, egfr, category)])
Or a more data.tabley solution using the more arcane foverlaps:
library(data.table)
# Proper data.tables
setDT(df1, key = c("id", "tstart", "tstop"))
setDT(df2, key = c("id", "tstart", "tstop"))
# We add an infinite upper range
proper_df1 <- rbind(
df1,
df1[, .SD[which.max(tstop)], by = .(id)][, .(id, tstart = tstop, tstop = Inf, category), ]
)
setkey(proper_df1, id, tstart, tstop)
overlaps <- foverlaps(df2, proper_df1, type = "any") # Overlap join
overlaps[
tstart %between% .(i.tstart, i.tstop) & tstart != 1,
i.tstart := tstart
]
overlaps[tstop %between% .(i.tstart, i.tstop), i.tstop := tstop]
print(overlaps[
id == 3,
.(id, "tstart" = i.tstart, "tstop" = i.tstop, category, egfr)
])
This messy dplyr solution seems to work for this particular dataset but don't know would it work for all datasets, the direction of the fill may need to be altered depending on particular dataset
library(tidyverse)
library(magrittr)
df1 %>%
bind_rows(df2) %>%
group_by(id) %>%
arrange(id, tstop) %>%
mutate(
tstart = case_when(
tstart < lag(tstop) ~ lag(tstop), TRUE ~ tstart)) %>%
fill(egfr, category, .direction = "updown") %>%
ungroup() %>%
filter(id == 3)
tstart tstop category id egfr
<dbl> <dbl> <fct> <int> <dbl>
1 1 365 none 3 106
2 365 730 none 3 103
3 730 1096 none 3 80
4 1096 1426 none 3 92
5 1426 1707 high 3 92
6 1707 1826 high 3 92
7 1826 2557 high 3 97
8 2557 3652 high 3 90
9 3652 4864 high 3 81

How to convert dot in comma in data.frame

I would like to convert the dot (decimal separator) to comma as decimal separator.
I tried using format(decimal.mark=",") but got an error.
df<-structure(list(ponto = c("F01", "F02", "F03", "F04", "F05", "F06"
), `Vegetação Nativa` = c(0.09, 3.12, 8.22, 5.92, 1.95, 4.7),
Agricultura = c(91.78, 91.87, 100, 100, 91.5, 99.38), Pastagem = c(-16.99,
-33.16, -22.73, -24.12, -38, -47.3), `Área Urbana` = c(27.32,
27.32, 27.57, 27.57, 19.18, NaN), `Solo Exposto` = c(10.04,
2.13, 8.5, 6.64, -29.35, -442.86), `Corpo Hídrico` = c(-15.62,
-15.62, NaN, NaN, -17.11, -25.93)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
ponto = c("F01", "F02", "F03", "F04", "F05", "F06"), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))
I tried this, but got an error:
df%>%
format(decimal.mark=",")
One way is to use mutate and across from dplyr. Though this will still change their type to character.
library(dplyr)
df %>%
mutate(across(everything(), format, decimal.mark = ","))
Output
# A tibble: 6 × 7
# Groups: ponto [6]
ponto `Vegetação Nativa` Agricultura Pastagem `Área Urbana` `Solo Exposto` `Corpo Hídrico`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 F01 0,09 91,78 -16,99 27,32 10,04 -15,62
2 F02 3,12 91,87 -33,16 27,32 2,13 -15,62
3 F03 8,22 100 -22,73 27,57 8,5 NaN
4 F04 5,92 100 -24,12 27,57 6,64 NaN
5 F05 1,95 91,5 -38 19,18 -29,35 -17,11
6 F06 4,7 99,38 -47,3 NaN -442,86 -25,93
Additionally, if you are wanting to simply change how you are seeing the data while printing, plotting, etc. for anything that is as.character, then you can change the default options. You can also read more about it here (this post has a lot of discussion directly related to your question).
options(OutDec= ",")
Examples (after changing options):
c(1.5, 3.456, 40000.89)
# [1] 1,500 3,456 40000,890
However, the caveat is that the data must be character. So with your data, we could convert those to character, then they will display with the comma rather than period.
df %>% mutate(across(everything(), as.character))
# A tibble: 6 × 7
# Groups: ponto [6]
ponto `Vegetação Nativa` Agricultura Pastagem `Área Urbana` `Solo Exposto` `Corpo Hídrico`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 F01 0,09 91,78 -16,99 27,32 10,04 -15,62
2 F02 3,12 91,87 -33,16 27,32 2,13 -15,62
3 F03 8,22 100 -22,73 27,57 8,5 NaN
4 F04 5,92 100 -24,12 27,57 6,64 NaN
5 F05 1,95 91,5 -38 19,18 -29,35 -17,11
6 F06 4,7 99,38 -47,3 NaN -442,86 -25,93

code is running fine line by line but fails when ran as a whole chunk in rmarkdown

When I run just this line of the code, the results are as expected. When I run the chunk, the mutations stop on the third line. How can I fix this, I feel like this is something new that I did not face before with the same code.
Sample data:
> dput(head(out))
structure(list(SectionCut = c("S-1", "S-1", "S-1", "S-1", "S-2",
"S-2"), OutputCase = c("LL-1", "LL-2", "LL-3", "LL-4", "LL-1",
"LL-2"), V2 = c(81.782, 119.251, 119.924, 96.282, 72.503, 109.595
), M3 = c("-29.292000000000002", "-32.661999999999999", "-30.904",
"-23.632999999999999", "29.619", "32.994"), id = c("./100-12-S01.xlsx",
"./100-12-S01.xlsx", "./100-12-S01.xlsx", "./100-12-S01.xlsx",
"./100-12-S01.xlsx", "./100-12-S01.xlsx")), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
SectionCut = c("S-1", "S-1", "S-1", "S-1", "S-2", "S-2"),
OutputCase = c("LL-1", "LL-2", "LL-3", "LL-4", "LL-1", "LL-2"
), id = c("./100-12-S01.xlsx", "./100-12-S01.xlsx", "./100-12-S01.xlsx",
"./100-12-S01.xlsx", "./100-12-S01.xlsx", "./100-12-S01.xlsx"
), .rows = list(1L, 2L, 3L, 4L, 5L, 6L)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
> dput(head(Beamline_Shear))
structure(list(VLL = c(159.512186, 154.3336, 149.4451613, 167.0207595,
161.2269091, 156.4116505)), row.names = c("84-9", "84-12", "84-15",
"92-9", "92-12", "92-15"), class = "data.frame")
Code that I am trying to run:
Shear <- out[,-4] %>% mutate(N_l = str_extract(OutputCase,"\\d+"),
UG = str_extract(id,"\\d+"), a = str_extract(id,"-\\d+"),
S = str_extract(a,"\\d+"), Sections = paste0(UG,"-",S),
Sample = str_remove_all(id, "./\\d+-\\d+-|.xlsx")) %>%
left_join(Beamline_Shear %>% rownames_to_column("Sections"), by = "Sections") %>%
select(-OutputCase,-id,-Sections,-a)
There are some group attributes in the data, which should work normally, but can be an issue if we are running in a different env. Also, the mutate step and the join step doesn't really need any grouping attributes as they are fairly very straightforward rowwise operations that are vectorized.
library(dplyr)
out %>%
select(-4) %>%
ungroup %>% # // removes group attributes
mutate(N_l = str_extract(OutputCase,"\\d+"),
UG = str_extract(id,"\\d+"), a = str_extract(id,"-\\d+"),
S = str_extract(a,"\\d+"), Sections = paste0(UG,"-",S),
Sample = str_remove_all(id, "./\\d+-\\d+-|.xlsx")) %>% left_join(Beamline_Shear %>% rownames_to_column("Sections"), by = "Sections")
# A tibble: 6 x 11
# SectionCut OutputCase V2 id N_l UG a S Sections Sample VLL
# <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
#1 S-1 LL-1 81.8 ./100-12-S01.xlsx 1 100 -12 12 100-12 S01 NA
#2 S-1 LL-2 119. ./100-12-S01.xlsx 2 100 -12 12 100-12 S01 NA
#3 S-1 LL-3 120. ./100-12-S01.xlsx 3 100 -12 12 100-12 S01 NA
#4 S-1 LL-4 96.3 ./100-12-S01.xlsx 4 100 -12 12 100-12 S01 NA
#5 S-2 LL-1 72.5 ./100-12-S01.xlsx 1 100 -12 12 100-12 S01 NA
#6 S-2 LL-2 110. ./100-12-S01.xlsx 2 100 -12 12 100-12 S01 NA

Calculate year-to-year absolute change in R

Give a dataframe df as follows:
df <- structure(list(year = c(2001, 2002, 2003, 2004), `1` = c(22.0775,
24.2460714285714, 29.4039285714286, 27.7110714285714), `2` = c(27.2535714285714,
35.9996428571429, 26.39, 27.8557142857143), `3` = c(24.7710714285714,
25.4428571428571, 15.1142857142857, 19.9657142857143)), row.names = c(NA,
-4L), groups = structure(list(year = c(2001, 2002, 2003, 2004
), .rows = structure(list(1L, 2L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Out:
year 1 2 3
0 2001 22.07750 27.25357 24.77107
1 2002 24.24607 35.99964 25.44286
2 2003 29.40393 26.39000 15.11429
3 2004 27.71107 27.85571 19.96571
For column 1, 2 and 3, how could I calculate year-to-year absolute change?
The expected result will like this:
year 1 2 3
0 2002 2.16857 8.74607 0.67179
1 2003 5.15786 9.60964 10.32857
2 2004 1.69286 1.46571 4.85142
The final objective is to compare values of 1, 2, 3 columns across all years, find the largest change year and column, at this example, it should be 2003 and column 3.
How could I do that in R? Thanks.
You can use :
library(dplyr)
data <- df %>% ungroup %>% summarise(across(-1, ~abs(diff(.))))
data
# A tibble: 3 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
#1 2.17 8.75 0.672
#2 5.16 9.61 10.3
#3 1.69 1.47 4.85
To get max change
mat <- which(data == max(data), arr.ind = TRUE)
mat
# row col
#[1,] 2 3
#Year name
df$year[mat[, 1] + 1]
#[1] 2003
#Column name
mat[, 2]
#col
# 3
You can try:
library(reshape2)
library(dplyr)
#Melt
Melted <- reshape2::melt(df,id.vars = 'year')
#Group
Melted %>% group_by(variable) %>% mutate(Diff=c(0,abs(diff(value)))) %>% ungroup() %>%
filter(Diff==max(Diff))
# A tibble: 1 x 4
year variable value Diff
<dbl> <fct> <dbl> <dbl>
1 2003 3 15.1 10.3
We can apply the diff on the entire dataset by converting the numeric columns of interest to matrix in base R
cbind(year = df$year[-1], abs(diff(as.matrix(df[-1]))))
# year 1 2 3
#[1,] 2002 2.168571 8.746071 0.6717857
#[2,] 2003 5.157857 9.609643 10.3285714
#[3,] 2004 1.692857 1.465714 4.8514286

How to draw multiple lines in R under leaflet?

I am having trouble drawing multiple lines in R using leaflet. I have a base map of New York City stations. I would like to add more information from the existing data set. The data set has columns: start_lng, start_lat, end_lng end_lat and total_trip. For each row, I would like to draw a line that connects the start point and the end point separately. Then the two stations will be connect, which stands for a trip. I hope to have one trip for each row. Plus, for coloring, the darkness of the line segments will be based on the total_trip. How would I be able to do that? Thanks.
leaflet(sample) %>%
addTiles() %>%
setView(-73.9,40.7, zoom = 11) %>%
addCircles(data = master_stations,lng = ~long, lat = ~lat, weight = 1, popup = ~name)
Here's part of my data set:
start.station.id start.station.longitude start.station.latitude end.station.longitude end.station.latitude total_trip
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 72 -73.99393 40.76727 -74.00859 40.73620 2
2 72 -73.99393 40.76727 -73.99074 40.73455 2
3 72 -73.99393 40.76727 -73.97722 40.76341 2
4 72 -73.99393 40.76727 -73.98192 40.76527 2
5 79 -74.00667 40.71912 -73.98163 40.75206 2
6 79 -74.00667 40.71912 -73.98658 40.75514 2
7 79 -74.00667 40.71912 -73.98317 40.75527 2
8 79 -74.00667 40.71912 -73.98722 40.75300 2
9 83 -73.97632 40.68383 -73.97493 40.68981 4
10 83 -73.97632 40.68383 -73.98657 40.70149 2
# ... with 899 more rows
This is the full data set:
structure(list(start.station.id = c(72, 72, 72, 72, 79, 79),
end.station.id = c(238, 285, 352, 468, 153, 465), total_trip = c(2L,
2L, 2L, 2L, 2L, 2L), start.station.name = c("\"W 52 St & 11 Ave\"",
"\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"",
"\"Franklin St & W Broadway\"", "\"Franklin St & W Broadway\""
), start.station.longitude = c(-73.99392888, -73.99392888,
-73.99392888, -73.99392888, -74.00666661, -74.00666661),
start.station.latitude = c(40.76727216, 40.76727216, 40.76727216,
40.76727216, 40.71911552, 40.71911552), end.station.name = c("\"Bank St & Washington St\"",
"\"Broadway & E 14 St\"", "\"W 56 St & 6 Ave\"", "\"Broadway & W 55 St\"",
"\"E 40 St & 5 Ave\"", "\"Broadway & W 41 St\""), end.station.longitude = c(-74.00859207,
-73.99074142, -73.97722479, -73.98192338, -73.9816324043,
-73.98658032), end.station.latitude = c(40.7361967, 40.73454567,
40.76340613, 40.7652654, 40.752062307, 40.75513557)), .Names = c("start.station.id",
"end.station.id", "total_trip", "start.station.name", "start.station.longitude",
"start.station.latitude", "end.station.name", "end.station.longitude",
"end.station.latitude"), row.names = c(NA, -6L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = list(start.station.id), drop = TRUE, indices = list(
0:3, 4:5), group_sizes = c(4L, 2L), biggest_group_size = 4L, labels = structure(list(
start.station.id = c(72, 79)), row.names = c(NA, -2L), class = "data.frame", vars = list(
start.station.id), drop = TRUE, .Names = "start.station.id"))

Resources