Putting string values in a column of lists - r

I have data that looks as follows (example data at the bottom):
# A tibble: 40 × 6
rn strata lower upper direction value
<chr> <list> <chr> <chr> <chr> <chr>
1 A <dbl [6]> 0 25 East 0 (replaced)
2 A <dbl [6]> 25 100 East 3 (replaced)
3 A <dbl [6]> 100 500 East 3
4 A <dbl [6]> 500 1000 East 4
5 A <dbl [6]> 1000 1000000 East 5
6 A <dbl [6]> 0 25 North 0 (replaced)
7 A <dbl [6]> 25 100 North 0 (replaced)
8 A <dbl [6]> 100 500 North 1
9 A <dbl [6]> 500 1000 North 28 (replaced)
10 A <dbl [6]> 1000 1000000 North 2
# … with 30 more rows
I would like to concatenate all value entries by rn, direction, upper. This can almost be done with the following code:
dat_in_new <- dat %>%
# One line for each rn-group
group_by(rn, upper, direction) %>%
# Calculate the sum, not taking into account replaced values
summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>%
group_modify(~add_row(.,freq = sum(.$value))) %>%
group_by(rn) %>%
summarise(freq = list(freq), .groups = "drop")
# A tibble: 2 × 2
rn freq
<chr> <list>
1 A c(0, 0, 0, 0, 0, 4, 0, 3, 0, 0, 5, 2, 9, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 0, 0)
2 B c(0, 0, 1, 0, 0, 13, 0, 2, 1, 0, 10, 3, 5, 0, 0, 1, 0, 1, 0, 0, 4, 0, 0, 1, 0)
This solution now has the correct sum, because the replaced values should not be added to the sum. However they should be added to the list. I have been trying to separate the two, but I cannot figure it out.
EDIT:
I thought it would maybe be possible to create another value column, say value_string, force value to numeric and keep value_string as strings, summarise both of them, get the sum from value and the values from value_string. But I can't figure out how to write the syntax.
Desired output:
# A tibble: 2 × 2
rn freq
<chr> <list>
1 A c("0 (replaced)", "0 (replaced)", ... )
2 B c("0 (replaced)", "0 (replaced)", ... )
Related questions:
Make a list out of frequencies, concatenating categories to that list
Using a column, with lists of values, to specify from which columns to create another list of values
DATA
library(dplyr)
library(tidyr)
dat <- structure(list(rn = c("A", "A", "A", "A",
"A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B"), strata = list(
c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000,
1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500,
1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100,
500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0,
25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06
), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000,
1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500,
1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100,
500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0,
25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06
), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000,
1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500,
1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100,
500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0,
25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06
), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000,
1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500,
1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100,
500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0,
25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000, 1e+06
), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500, 1000,
1e+06), c(0, 25, 100, 500, 1000, 1e+06), c(0, 25, 100, 500,
1000, 1e+06)), lower = c("0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000"), upper = c("25", "100", "500",
"1000", "1000000", "25", "100", "500", "1000", "1000000", "25",
"100", "500", "1000", "1000000", "25", "100", "500", "1000",
"1000000", "25", "100", "500", "1000", "1000000", "25", "100",
"500", "1000", "1000000", "25", "100", "500", "1000", "1000000",
"25", "100", "500", "1000", "1000000"), direction = c("East",
"East", "East", "East", "East", "North", "North", "North", "North",
"North", "South", "South", "South", "South", "South", "West",
"West", "West", "West", "West", "East", "East", "East", "East",
"East", "North", "North", "North", "North", "North", "South",
"South", "South", "South", "South", "West", "West", "West", "West",
"West"), value = c("0 (replaced)", "3 (replaced)", "3", "4", "5",
"0 (replaced)", "0 (replaced)", "1", "28 (replaced)", "2", "0 (replaced)",
"2 (replaced)", "1", "3", "9", "0 (replaced)", "1 (replaced)", "9 (replaced)",
"8 (replaced)", "21 (replaced)", "1", "61 (replaced)", "4", "13", "10",
"2 (replaced)", "12 (replaced)", "48 (replaced)", "32 (replaced)", "3",
"1", "1", "76 (replaced)", "2", "5", "0 (replaced)", "4 (replaced)",
"1", "1", "15 (replaced)")), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame"))

Perhaps this helps
library(dplyr)
library(stringr)
out <- dat %>%
mutate(value_str = replace(value, str_detect(value, "^[0-9]+$"), NA_character_),
value = as.numeric(value)) %>%
group_by(rn, lower, upper) %>%
transmute(value = sum(value, na.rm = TRUE), value_str) %>%
group_by(rn, lower) %>%
group_modify(~add_row(., upper = "Sum", value = sum(.$value))) %>%
ungroup %>%
mutate(value = coalesce(value_str, as.character(value))) %>%
distinct(rn, lower, upper, value) %>%
group_by(rn) %>%
summarise(value = list(value))
-output
> out$value
[[1]]
[1] "0 (replaced)" "0" "5" "9 (replaced)" "20" "16" "21 (replaced)" "64" "3 (replaced)"
[10] "2 (replaced)" "1 (replaced)" "7" "28 (replaced)" "8 (replaced)" "28"
[[2]]
[1] "2" "2 (replaced)" "0 (replaced)" "8" "5" "48 (replaced)" "76 (replaced)" "20" "18"
[10] "15 (replaced)" "72" "61 (replaced)" "12 (replaced)" "1" "4 (replaced)" "4" "16" "32 (replaced)"
[19] "64"

I am not sure, but maybe you are looking for this:
What we do here is simple paste and collapse all!! the values after unnesting:
library(dplyr)
library(tidyr)
dat %>%
group_by(rn, upper,direction) %>%
summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>%
group_modify(~add_row(.,freq = sum(.$value))) %>%
group_by(rn) %>%
summarise(freq = list(freq), .groups = "drop") %>%
unnest() %>%
group_by(rn) %>%
mutate(freq = paste0(freq, " (replaced)", collapse = ", ")) %>%
slice(1)
rn freq
<chr> <chr>
1 A 0 (replaced), 0 (replaced), 0 (replaced), 0 (replaced), 0 (re~
2 B 0 (replaced), 0 (replaced), 1 (replaced), 0 (replaced), 0 (re~

I eventually figured it out, although it is far from the cleanest approach:
# Only sum values that are not replaced
dat$upper <- as.character(dat$upper)
dat <- dat %>%
group_by(rn, direction ) %>%
summarise(value = as.character(sum(as.numeric(value), na.rm=TRUE)), .groups = 'drop_last', upper="1000001", strata=strata) %>% # get sum of sizes
bind_rows(dat, .)
# Remove the duplicate rows
dat <- unique( dat )
# Convert upper back to numeric for sorting
dat$upper <- as.numeric(dat$upper)
dat <- dat %>%
arrange(rn, direction, upper)
# Create list
dat <- dat %>%
group_by(rn, strata) %>%
summarise(freq = list(value), .groups = 'drop')

Related

Unstructured txt file with similar pattern for all rows in R

I am currently working with a .txt file and have used the read_table2() function to read it, resulting in the following structure.
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 FVP110~ 2.08e6 1101~ 1.10e 3 6 0 0 0 6 01101 6 0 0 0 6 01101 6 0 0 0
2 FVP110~ 2.06e4 8 9.3 e 1 2 93 0 0 0 0 0 093 0 0 0 0 0 093 0 0
3 FVP110~ 2.10e6 6 9.3 e 1 2 93 0 0 0 0 0 093 0 0 0 0 0 093 0 0
4 FVP110~ 2.10e6 6 3.11e18 3111 8 0 0 0 8 03111 8 0 0 0 8 03111 8 0 0
5 FVP110~ 2.08e6 94 2 e 0 94 0 0 0 0 0 094 0 0 0 0 0 094 0 0 0
6 FVP110~ 2.06e4 6 9.2 e 1 2 92 0 0 0 0 0 092 0 0 0 0 0 092 0 0
# ... with 31 more variables: X21 <chr>, X22 <chr>, X23 <chr>, X24 <chr>, X25 <chr>, X26 <chr>, X27 <chr>, X28 <chr>,
# X29 <chr>, X30 <chr>, X31 <chr>, X32 <chr>, X33 <chr>, X34 <chr>, X35 <chr>, X36 <chr>, X37 <chr>, X38 <chr>, X39 <chr>,
# X40 <chr>, X41 <chr>, X42 <chr>, X43 <chr>, X44 <chr>, X45 <chr>, X46 <chr>, X47 <chr>, X48 <chr>, X49 <chr>, X50 <chr>,
# X51 <dbl>
I know that my first column, instead of being FVP1104Q1V110121011010110110527421101011165 is always a 4 chr 3 dbl 2chr 2chr 1dbl 2dbl etc. In total, there are 51 columns but if parsed correctly they will become a total of 129.
These are the first 10 rows and 10 columns of my data set.
structure(list(X1 = c("FVP1104Q1V110121011010110110527421101011165",
"FVP1104Q1V110121011010110110527421101022262", "FVP1104Q1V110121011010110110527421101033231",
"FVP1104Q1V110121011010110110527421101044134", "FVP1104Q1V110121011010110110527421102011165",
"FVP1104Q1V110121011010110110527421102022260", "FVP1104Q1V110121011010110110527421102033138",
"FVP1104Q1V110121011010110110527421102044232", "FVP1104Q1V11012101101011011052742110205616",
"FVP1104Q1V110121011010110110527421102063142"), X2 = c(2080110,
20601, 2100112, 2100112, 2080110, 20601, 2120115, 2100112, 10501,
40701), X3 = c("11011116112", "8", "6", "6", "94", "6", "6",
"6", "124", "8"), X4 = c(1101, 93, 93, 3111045932226084352, 2,
92, 3185102331226052608, 93, 91, 6), X5 = c(6, 2, 2, 3111, 94,
2, 3185, 2, 2, 11011216112), X6 = c(0, 93, 93, 8, 0, 92, 8, 93,
91, 1101), X7 = c("0", "0", "0", "0", "0", "0", "0", "0", "0",
"6"), X8 = c("0", "0", "0", "0", "0", "0", "0", "0", "0", "0"
), X9 = c("6", "0", "0", "0", "0", "0", "0", "0", "0", "0"),
X10 = c("01101", "0", "0", "8", "0", "0", "8", "0", "0",
"0"), X11 = c("6", "0", "0", "03111", "094", "0", "03185",
"0", "0", "6"), X12 = c("0", "093", "093", "8", "0", "092",
"8", "093", "091", "01101"), X13 = c("0", "0", "0", "0",
"0", "0", "0", "0", "0", "6"), X14 = c("0", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), X15 = c("6", "0", "0", "0",
"0", "0", "0", "0", "0", "0")), row.names = c(NA, 10L), class = "data.frame")
And I want to get
structure(list(fileid = structure(c("FVP1", "FVP1", "FVP1", "FVP1",
"FVP1", "FVP1", "FVP1", "FVP1", "FVP1", "FVP1"), label = "File Identification", format.stata = "%9s"),
schedule = structure(c(104, 104, 104, 104, 104, 104, 104,
104, 104, 104), label = "Schedule", format.stata = "%8.0g"),
quarter = structure(c("Q3", "Q3", "Q3", "Q3", "Q3", "Q3",
"Q3", "Q3", "Q3", "Q3"), label = "Quarter", format.stata = "%9s"),
visit = structure(c("V1", "V1", "V1", "V1", "V1", "V1", "V1",
"V1", "V1", "V1"), label = "Visit", format.stata = "%9s"),
sector = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sector", format.stata = "%8.0g"),
state = structure(c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36
), label = "State/Ut Code", format.stata = "%8.0g"), district = structure(c(10,
10, 10, 10, 10, 10, 10, 10, 10, 10), label = "District Code", format.stata = "%8.0g"),
region = structure(c(362, 362, 362, 362, 362, 362, 362, 362,
362, 362), label = "NSS-Region", format.stata = "%8.0g"),
stratum = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Stratum", format.stata = "%8.0g"),
substratum = structure(c(8, 8, 8, 8, 8, 8, 8, 8, 8, 8), label = "Sub-Stratum", format.stata = "%8.0g"),
subsample = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Sub-Sample", format.stata = "%8.0g"),
subregion = structure(c(3613, 3613, 3613, 3613, 3613, 3613,
3613, 3613, 3613, 3613), label = "Fod Sub-Region", format.stata = "%8.0g"),
fsu = structure(c(50030, 50030, 50030, 50030, 50030, 50030,
50030, 50030, 50030, 50030), label = "FSU", format.stata = "%10.0g"),
sbno = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sample Sg/Sb No.", format.stata = "%8.0g"),
sss = structure(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2), label = "Second Stage Stratum No.", format.stata = "%8.0g")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I'm trying to replicate reshaping of a .txt data using a dictionary .dct in Stata, but I don't find a clear way to do that in R.
My data also includes NA's
As per MrFlick's suggestion, we can use tidyr::separate to break apart your first column into multiple columns by position:
library(tidyr)
data.frame(X1 = "FVP1104Q1V110121011010110110527421101011165") %>%
separate(
X1,
sep = c(4, 7, 9, 11, 12),
into = paste0("X1_", 1:6)
)
# X1_1 X1_2 X1_3 X1_4 X1_5 X1_6
# 1 FVP1 104 Q1 V1 1 0121011010110110527421101011165

adjust the elements of a column to get a cumsum equal to zero

I have this columns in a bigger dataset (here i just report asset "x" but there are different, hence the idea is to replicate the process for every asset):
df <- structure(list(
asset = c("x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x")
col1 = c(10, 10, -22, 11, -13, 15, -7, -10, 10, -5, 3),
cumsum(col1) = c(10, 20, -2, 9, -4, 11, 4, -6, 4, -1, 2),
class = "data.frame", row.names = c(NA, -11L)
)
I want to correct the negative number in col1 such that the cumsum(col1) becomes equal to
cumsum(col1) = c(10, 20, 0, 11, 0, 15, 8, 0, 10, 5, 8)
To get that result I need to correct the col1 number iff the negative number is bigger than the cumsum of the previous number.
For example the -22 in third position should become -20 to match the cumsum of the previous 10+10
Then the -13 should become equal to -11 and the -10 should become -8, while the last three numbers shouldn't change since they do not cumsum to a negative outcome.
So at the end of the correction process I should get
col1 = c(10, 10, -20, 11, -11, 15, -7, -8, 10, -5, 3)
cumsum(col1) = c(10, 20, 0, 11, 0 ,15, 8, 0, 10, 5, 8)
In the process of correction I think that the mechanism should be (I don't know how to do it with R, but I get something in theoretical terms) :
group_by = each group in col1 should be defined by each col1(row) greater than the cumsum of its previous rows and restard whenever the col1(row) is greater than the previous elements cumsum
iff col1(row) is greater than the previous cumsum, correct the col1(row) with the group cumsum number with a negative sign in front
cumsum col1 and check again iff the result matches the desired output, hence there should be no negative cumsum values. The min should be equal to 0
in the original dataset I have multiple asset types, hence not only "x" but also "y", "z", and others. Furthermore I need to group_by investors since the same situation can be applied to 4k investors. hence the real dataset is something like this:
df <- structure(list(
investor = c("1", "1", "1", "2", "2", "2", "3", "3", "4", "4", "4"),
asset = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "z")
col1 = c(10, 10, -22, 11, -13, 15, 9, -10, 10, -5, 3),
cumsum(col1) = c(10, 20, -2, 11, -2, 13, 9, -1, 10, 5, 3),
class = "data.frame", row.names = c(NA, -11L)
)
where i need it to become (the code should just take care of group_by(investor, asset))
df <- structure(list(
investor = c("1", "1", "1", "2", "2", "2", "3", "3", "4", "4", "4"),
asset = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "z")
col1 = c(10, 10, -20, 11, -11, 15, 9, -9, 10, -5, 3),
cumsum(col1) = c(10, 20, 0, 11, 0, 15, 9, 0, 10, 5, 3),
class = "data.frame", row.names = c(NA, -11L)
)
I wrote thinking about a dplyr solution since I'm more confortable with that but I don't know if it is possibile to do in dplyr.
Thanks for the help!
We may do this with accumulate
library(dplyr)
library(purrr)
df %>%
group_by(asset) %>%
mutate(col2csum = accumulate(col1, ~ if(abs(.x + .y) < abs(.y)) 0 else
.x + .y)) %>%
ungroup
-output
# A tibble: 11 × 3
asset col1 col2csum
<chr> <dbl> <dbl>
1 x 10 10
2 x 10 20
3 x -22 0
4 x 11 11
5 x -13 0
6 x 15 15
7 x -7 8
8 x -10 0
9 x 10 10
10 x -5 5
11 x 3 8
Update
If we want to change the 'col1'
df %>%
group_by(asset) %>%
mutate(col2csum = accumulate(col1, ~ if(abs(.x + .y) < abs(.y)) 0 else
.x + .y), col1 = c(first(col2csum), diff(col2csum))) %>% ungroup
-output
# A tibble: 11 × 3
asset col1 col2csum
<chr> <dbl> <dbl>
1 x 10 10
2 x 10 20
3 x -20 0
4 x 11 11
5 x -11 0
6 x 15 15
7 x -7 8
8 x -8 0
9 x 10 10
10 x -5 5
11 x 3 8
data
df <- structure(list(asset = c("x", "x", "x", "x", "x", "x", "x", "x",
"x", "x", "x"), col1 = c(10, 10, -22, 11, -13, 15, -7, -10, 10,
-5, 3)), class = "data.frame", row.names = c(NA, -11L))

Capitalizing the first letter of characters in a column using substr function

I have this data frame
head(df)
## patnum hospstay lowph pltct race bwt gest inout twn lol magsulf
## 1 1 34 NA 100 white 1250 35 born at duke 0 NA NA
## 2 2 9 7.250000 244 white 1370 32 born at duke 0 NA NA
## 3 3 -2 7.059998 114 black 620 23 born at duke 0 NA NA
## 4 4 40 7.250000 182 black 1480 32 born at duke 0 NA NA
## 5 5 2 6.969997 54 black 925 28 born at duke 0 NA NA
## 6 6 62 7.189999 NA white 940 28 born at duke 0 NA NA
## meth toc delivery apg1 vent pneumo pda cld sex dead
## 1 0 0 abdominal 8 0 0 0 0 female 0
## 2 1 0 abdominal 7 0 0 0 0 female 0
## 3 0 1 vaginal 1 1 0 0 NA female 1
## 4 1 0 vaginal 8 0 0 0 0 male 0
## 5 0 0 abdominal 5 1 1 0 0 female 1
## 6 1 0 abdominal 8 1 0 0 0 female 0
The race variable has 4 entries, "white", "black", "native american", "oriental". I am wanting to replace this string with capitalized versions "White", "Black", "Native American", "Oriental". I would like to do this using the substr function. I'm not sure what code to use to accomplish this. I was provided an example below, where the
day_full = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
substr(day_full_1, nchar(day_full_1)-2, nchar(day_full_1)) = "DAY"
The result is: "SunDAY", "MonDAY", "TuesDAY", "WednesDAY", "ThursDAY", "FriDAY", "SaturDAY", "SunDAY"
This is similar to what I want to do, but I only want to have the first letter of each of the races to be capitalized. How would I translate this to make each first letter of the 4 races capital?
This is the solution I've tried now.
substr(SB_xlsx$race, 1, 1) <- toupper(substr(SB_xlsx$race, 1, 1))
substr(SB_xlsx$race, 1, 1)
## structure(list(patnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), hospstay = c(34,
## 9, -2, 40, 2, 62, 32, NA, NA, 28, 38, NA, 62, 69, 1, 93, 44,
## 50, 66, 65, 44, 70, 85, NA), lowph = c(NA, 7.25, 7.059998, 7.25,
## 6.969997, 7.189999, 7.32, NA, NA, 7.16, 7.039997, NA, 7.179996,
## 7.419998, 7.119999, 7.239998, 7.129997, 7.269997, 7.179996, 7.07,
## 7.289997, 7.129997, 7.189999, NA), pltct = c(100, 244, 114, 182,
## 54, NA, 282, NA, NA, 153, 229, NA, 182, 361, 378, 255, 186, NA,
## 260, 183, 134, 229, 68, NA), race = c("white", "white", "black",
## "black", "black", "white", "black", NA, NA, "black", "white",
## NA, "black", "white", "white", "black", "white", "black", "black",
## "white", "white", "black", "white", NA), bwt = c(1250, 1370,
## 620, 1480, 925, 940, 1255, 600, 700, 1350, 1310, 550, 1110, 1180,
## 970, 770, 1490, 1170, 1360, 1330, 1000, 1120, 740, NA), gest = c(35,
## 32, 23, 32, 28, 28, 29.5, 26, 24, 34, 32, 24, 28, 28, 28, 26,
## 33, 31, 31, 31, 28, 29, 26, NA), inout = c("born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", NA), twn = c(0, 0, 0, 0, 0, 0, 0, NA, NA, 0,
## 0, NA, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, NA), lol = c(NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA), meth = c(0, 1, 0, 1, 0, 1, 1, NA, NA, 1, 0, NA, 0, 0,
## 1, 1, 1, 1, 1, 1, 0, 1, 0, NA), toc = c(0, 0, 1, 0, 0, 0, 0,
## NA, NA, 0, 0, NA, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, NA), delivery = c("abdominal",
## "abdominal", "vaginal", "vaginal", "abdominal", "abdominal",
## "vaginal", NA, NA, "abdominal", "vaginal", NA, "vaginal", "abdominal",
## "vaginal", "vaginal", "abdominal", "vaginal", "vaginal", "vaginal",
## "vaginal", "vaginal", "abdominal", NA), apg1 = c(8, 7, 1, 8,
## 5, 8, 9, NA, NA, 4, 6, NA, 6, 6, 2, 4, 8, 7, 1, 8, 5, 9, 9, NA
## ), vent = c(0, 0, 1, 0, 1, 1, 0, NA, NA, 0, 1, NA, 0, 0, 1, 1,
## 0, 0, 1, 1, 0, 1, 0, NA), pneumo = c(0, 0, 0, 0, 1, 0, 0, NA,
## NA, 0, 0, NA, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, NA), pda = c(0,
## 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,
## 0, 0, NA), cld = c(0, 0, NA, 0, 0, 0, 0, NA, NA, 0, 0, NA, 1,
## 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, NA), sex = c("female", "female",
## "female", "male", "female", "female", "female", NA, NA, "female",
## "male", NA, "male", "male", "female", "male", "male", "female",
## "male", "male", "female", "female", "female", NA), dead = c(0,
## 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
## 0, NA)), class = "data.frame", row.names = c(NA, -24L))
Two solutions:
df <- structure(list(patnum = 1:6, hospstay = c(34L, 9L, -2L, 40L, 2L, 62L), lowph = c(NA, 7.25, 7.059998, 7.25, 6.969997, 7.189999), pltct = c(100L, 244L, 114L, 182L, 54L, NA), race = c("white", "white", "black", "black", "black", "white"), bwt = c(1250L, 1370L, 620L, 1480L, 925L, 940L), gest = c(35L, 32L, 23L, 32L, 28L, 28L), inout = c("born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke"), twn = c(0L, 0L, 0L, 0L, 0L, 0L), lol = c(NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
tools::toTitleCase(df$race)
# [1] "White" "White" "Black" "Black" "Black" "White"
But those are simpler with no spaces, let's create one for this exercise:
vec <- c("white", "black", "native american")
tools::toTitleCase(vec)
# [1] "White" "Black" "Native American"
We can also use gregexpr/regmatches to do it:
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre)
# [[1]]
# [1] "w"
# [[2]]
# [1] "b"
# [[3]]
# [1] "n" "a"
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
# [1] "White" "Black" "Native American"
I'm sure there's a stringr-variant out there as well.
As for substr, it's feasible to use regex to find all (1) first-chars and (2) all chars that follow a space, then extract each one, then toupper-them, then put that back into place ... but at that point you're still using regex and effectively doing what toTitleCase is doing natively and what this gregexpr/regmatches code is doing a little more verbosely.
If all you wanted to do was replace the first character, though, and not care about letters after spaces, then
substr(vec, 1, 1) <- toupper(substr(vec, 1, 1))
vec
# [1] "White" "Black" "Native american"
though in this case, I think the lower-case "a" in "Native american" is wrong, so I don't think this is the best approach.
Scaling
Since you are concerned about scaling (I'm assuming you're venturing into 100K or more, since less than that is not going to be an issue with any method demonstrated), here's a comparison:
bench::mark(
toTitleCase = tools::toTitleCase(vec),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
}
)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 401us 474us 1735. 4.15KB 0 868 0 500ms <chr [3]> <Rprofmem [9 x 3]> <bench_~ <tibble~
# 2 gregexpr 111us 205us 5240. 24.28KB 2.26 2315 1 442ms <chr [3]> <Rprofmem [6 x 3]> <bench_~ <tibble~
Granted, vec size 3 is pretty small, let's scale that up a bit.
vec30000 <- rep(vec, 10000) # 30000 length
bench::mark(
toTitleCase = tools::toTitleCase(vec30000),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000
}
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 6.01s 6.01s 0.166 36MB 0.832 1 5 6.01s <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
# 2 gregexpr 773.13ms 773.13ms 1.29 241MB 2.59 1 2 773.13ms <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
Looking at the `itr/sec` column showing iterations per second, it appears that even at scale, the gregexpr method works better. (If you look at the source code for toTitleCase, you'll see why: it's consider a lot more than just space-delimited words, it's also consider linking words, exception-words, etc.)
Another way is to use perl substitution:
gsub('\\b(\\w)', '\\U\\1', vec, perl = TRUE)
[1] "White" "Black" "Native American"
This method is way faster (ie 35+ times Faster) than the gregexpr method mentioned before:
microbenchmark::microbenchmark(
gsub = gsub('\\b(\\w)', '\\U\\1', vec30000, perl = TRUE),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000 },
unit = 'relative', check = 'equal')
Unit: relative
expr min lq mean median uq max neval
gsub 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 5
gregexpr 37.37549 41.10014 29.00345 24.49221 25.39978 25.54325 5

ggplot: Multi-panel/facet scatter plots separated by multiple variables (AND NOT by multiple categories within a variable) using one common y-axis

My dataframe loopsubset_created contains 30 observations of 45 variables. (Below you will find str(loopsubset_created) and a dput(loopsubset_created) sample).
Now I would like to create scatter plots of the PdKeyT-Variable (y) versus five of the band-value-variables (BLUE, GREEN, RED, SWIR1, SWIR2) (x) with
each variable in ONE panel
all panels aligned in ONE row
using the PdKeyTvariable as common y-axis.
In the end it should basically look like this: (I did this with ggscatter, but for flexibility reasons I would prefer basically using ggplot)
Here now my issue:When trying with ggplot, I do not find the right way for the above showed arrangement, as I cannot figure out the right code for separating/grouping by variables. I found hundreds of tutorials for facetting by multiple categorial values within a variable, but not by multiple variables.
With the following code
ggplot(loopsubset_created, aes(y = PdKeyT)) +
geom_point(aes(x = BLUE, col = "BLUE")) +
geom_point(aes(x = GREEN, col = "GREEN")) +
geom_point(aes(x = RED, col = "RED")) +
geom_point(aes(x = SWIR1, col = "SWIR1")) +
geom_point(aes(x = SWIR2, col = "SWIR2"))
I came to this basic result
Here the basic question:
Now, I would like to arrange the 5 layers seperately in one row according to the above depicted way
Anybody an idea for me?
Plus some information around the question:
Though the following aspects are not directly part of my question, I'd like to describe my final idea of the plot (in order to avoid that your advices may clash with further requirements):
Each panel should include
Spearman corr value and according p-value (as shown above) and
additionaly Pearson corr value and according p-value
Linear regression with conf. interval (as shown above) or other type of regression line (not shown)
Points should be couloured by variable (BLUE=bLue, RED= red; GREEN=green, SWIR1+2 by some other coulours, e.g. magenta and violet)
later on points and regressionlines should be subdived by ranges of PdKeyT (e.g. below -10, -10-to 30, and above 30) with using differnt brightness values of variable basic colours (blue, green, ...), analogouos to this:
All panels should use ONE common y-axis at the left as explained
And I would like to adpat the x-axes by the range of the respecitve variable (e.g. range for BLUE, GREEN and RED from 500 to 3000 and the SWIRs from 0 to 1500
edit 31.10.2021 referring to your answers:
Would it furtheron be possible with your respective approaches to limit the x-axes individually as depicted in the 'further requirements' of my question (B-G-R ranging from 500 to 3000, SWIRs from 0 to 1500) with using coord_cartesian(xlim = c(min,max))? I am asking because I read some discussions with issues on limiting axes depending on the 'faceting approach'. But I'd like to control the x-axes, because I'll have many of these plots stacked on top of each other (My sample mirrored the data of just one sampling point out of 300). And i would be glad if getting them aligned.
I'd meanwhile prefer to discrete points and reglines just by gray scale colors (for all bands the same) and rather discretely coloring the panels by theme(panel.background = element_rect(fill = "#xxxxxx"). Do you see an issue with that?
Finally some information and sample of my data
> str(loopsubset_created)
'data.frame': 30 obs. of 45 variables:
$ Site_ID : chr "A" "A" "A" "A" ...
$ Spot_Nr : chr "1" "1" "1" "1" ...
$ Transkt_Nr : chr "2" "2" "2" "2" ...
$ Point_Nr : chr "4" "4" "4" "4" ...
$ n : int 30 30 30 30 30 30 30 30 30 30 ...
$ rank : int 3 3 3 3 3 3 3 3 3 3 ...
$ Tile : chr "1008" "1008" "1008" "1008" ...
$ Date : int 20190208 20190213 20190215 20190218 20190223 20190228 20190302 20190305 20190315 20190320 ...
$ id : chr "22" "22" "22" "22" ...
$ Point_ID : chr "1022" "1022" "1022" "1022" ...
$ Site_Nr : chr "1" "1" "1" "1" ...
$ Point_x : num 356251 356251 356251 356251 356251 ...
$ Point_y : num 5132881 5132881 5132881 5132881 5132881 ...
$ Classification : num 7 7 7 7 7 7 7 7 7 7 ...
$ Class_Derived : chr "WW" "WW" "WW" "WW" ...
$ BLUE : num 1112 1095 944 1144 1141 ...
$ GREEN : num 1158 1178 1009 1288 1265 ...
$ RED : num 599 708 613 788 835 ...
$ REDEDGE1 : num 359 520 433 576 665 761 618 598 881 619 ...
$ REDEDGE2 : num 83 82 65 169 247 404 116 118 532 162 ...
$ REDEDGE3 : num 73 116 81 142 233 391 56 171 538 131 ...
$ BROADNIR : num 44 93 60 123 262 349 74 113 560 125 ...
$ NIR : num 37 70 66 135 215 313 110 135 504 78 ...
$ SWIR1 : num 187 282 184 225 356 251 240 216 507 197 ...
$ SWIR2 : num 142 187 155 197 281 209 192 146 341 143 ...
$ Quality.assurance.information: num 26664 10272 10272 10272 8224 ...
$ Q00_VAL : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q01_CS1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q02_CSS : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q03_CSH : num 1 0 0 0 0 0 0 0 1 0 ...
$ Q04_SNO : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q05_WAT : num 1 1 1 1 1 1 1 1 1 1 ...
$ Q06_AR1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q07_AR2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q08_SBZ : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q09_SAT : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q10_ZEN : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q11_IL1 : num 1 1 1 1 0 0 0 0 0 0 ...
$ Q12_IL2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q13_SLO : num 1 1 1 1 1 1 1 1 1 1 ...
$ Q14_VAP : num 1 0 0 0 0 0 0 0 1 0 ...
$ Q15_WDC : num 0 0 0 0 0 0 0 0 0 0 ...
$ PdMax : int -7 -19 -20 -22 -24 -25 -26 -25 -21 -15 ...
$ PdMin : int -13 -23 -24 -26 -28 -29 -29 -28 -24 -20 ...
$ PdKeyT : int -10 -20 -22 -22 -27 -26 -26 -27 -22 -17 ...
loopsubset_created <- structure(list(Site_ID = c("A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A"), Spot_Nr = c("1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1"), Transkt_Nr = c("2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"), Point_Nr = c("4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4"), n = c(30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L), rank = c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), Tile = c("1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008"), Date = c(20190208L,
20190213L, 20190215L, 20190218L, 20190223L, 20190228L, 20190302L,
20190305L, 20190315L, 20190320L, 20190322L, 20190325L, 20190330L,
20190401L, 20190416L, 20190419L, 20190421L, 20190501L, 20190506L,
20190524L, 20190531L, 20190603L, 20190620L, 20190625L, 20190630L,
20190705L, 20190710L, 20190809L, 20190814L, 20190903L), id = c("22",
"22", "22", "22", "22", "22", "22", "22", "22", "22", "22", "22",
"22", "22", "22", "22", "22", "22", "22", "22", "22", "22", "22",
"22", "22", "22", "22", "22", "22", "22"), Point_ID = c("1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022"), Site_Nr = c("1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1"), Point_x = c(356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781), Point_y = c(5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701), Classification = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7), Class_Derived = c("WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW"), BLUE = c(1112, 1095, 944, 1144,
1141, 1010, 968, 1023, 1281, 1124, 1215, 1154, 1188, 1177, 1622,
1305, 1215, 2282, 2322, 2337, 2680, 2473, 1143, 1187, 1165, 1040,
1290, 1112, 1474, 1131), GREEN = c(1158, 1178, 1009, 1288, 1265,
1208, 1122, 1146, 1416, 1298, 1379, 1345, 1379, 1366, 1714, 1446,
1354, 2417, 2417, 2500, 2967, 2587, 1469, 1522, 1544, 1253, 1514,
1371, 1875, 1416), RED = c(599, 708, 613, 788, 835, 852, 726,
729, 1044, 816, 905, 908, 948, 970, 1206, 944, 935, 1648, 1741,
2004, 2109, 2032, 1241, 1290, 1419, 1206, 1424, 1339, 1969, 1321
), REDEDGE1 = c(359, 520, 433, 576, 665, 761, 618, 598, 881,
619, 722, 771, 829, 823, 937, 725, 759, 1327, 1395, 1756, 1718,
1753, 1533, 1528, 1683, 1335, 1605, 1499, 2016, 1592), REDEDGE2 = c(83,
82, 65, 169, 247, 404, 116, 118, 532, 162, 183, 218, 285, 200,
514, 182, 230, 568, 531, 1170, 780, 1101, 1192, 1174, 1250, 949,
1121, 1127, 1382, 1159), REDEDGE3 = c(73, 116, 81, 142, 233,
391, 56, 171, 538, 131, 205, 137, 321, 253, 503, 193, 214, 564,
527, 1192, 698, 1177, 1203, 1259, 1341, 1049, 1146, 1216, 1416,
1188), BROADNIR = c(44, 93, 60, 123, 262, 349, 74, 113, 560,
125, 121, 211, 325, 221, 480, 184, 178, 461, 435, 1067, 570,
1023, 961, 966, 964, 844, 764, 993, 1197, 834), NIR = c(37, 70,
66, 135, 215, 313, 110, 135, 504, 78, 115, 216, 197, 163, 462,
113, 165, 392, 349, 1006, 574, 1092, 1153, 1143, 1128, 961, 1033,
1027, 1164, 1086), SWIR1 = c(187, 282, 184, 225, 356, 251, 240,
216, 507, 197, 306, 260, 298, 290, 400, 190, 300, 275, 204, 678,
528, 1087, 1091, 1049, 1310, 935, 1199, 1169, 984, 1139), SWIR2 = c(142,
187, 155, 197, 281, 209, 192, 146, 341, 143, 271, 220, 246, 232,
387, 168, 217, 193, 173, 540, 374, 764, 766, 799, 869, 724, 827,
794, 745, 848), Quality.assurance.information = c(26664, 10272,
10272, 10272, 8224, 8224, 8224, 8224, 24616, 8224, 8224, 8224,
32, 8224, 8288, 24616, 8224, 8240, 48, 8208, 8240, 8192, 8192,
24648, 8192, 8192, 8192, 8192, 0, 8224), Q00_VAL = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q01_CS1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Q02_CSS = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q03_CSH = c(1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q04_SNO = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q05_WAT = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1), Q06_AR1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q07_AR2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q08_SBZ = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q09_SAT = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), Q10_ZEN = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q11_IL1 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q12_IL2 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q13_SLO = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1), Q14_VAP = c(1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q15_WDC = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), PdMax = c(-7L, -19L, -20L,
-22L, -24L, -25L, -26L, -25L, -21L, -15L, -19L, -17L, -23L,
-22L, -4L, -7L, -8L, 55L, 57L, 47L, 67L, 44L, 21L, 18L, 13L,
16L, 16L, 9L, 12L, 11L), PdMin = c(-13L, -23L, -24L, -26L,
-28L, -29L, -29L, -28L, -24L, -20L, -22L, -22L, -26L, -26L,
-7L, -11L, -11L, 46L, 47L, 36L, 52L, 37L, 17L, 14L, 9L, 11L,
9L, 5L, 5L, 2L), PdKeyT = c(-10L, -20L, -22L, -22L, -27L,
-26L, -26L, -27L, -22L, -17L, -19L, -19L, -23L, -23L, -5L,
-9L, -9L, 54L, 53L, 40L, 60L, 43L, 20L, 15L, 13L, 15L, 13L,
7L, 9L, 6L)), row.names = 198:227, class = "data.frame")
Update:
To fulfill your last task me could make use of the code that is from Allan Cameron: adding another column to set the cuts mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>% (this code was provided by Allan Cameron)
library(tidyverse)
library(ggpubr)
df_long_list <- loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(
cols = -PdKeyT
) %>%
mutate(color = case_when(name=="BLUE" ~ "blue",
name=="GREEN" ~ "green",
name=="RED" ~ "red",
name=="SWIR1" ~ "magenta",
name=="SWIR2" ~ "violet"))%>%
mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>%
group_split(name)
p <- ggplot()
for (i in 1:5) p <- p + geom_point(data=df_long_list[[i]], aes(value, PdKeyT, color=color, alpha=range))+
geom_smooth(data=df_long_list[[i]], aes(value, PdKeyT, group=range), method = lm, se=TRUE)+
theme(legend.position="none") +
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Spearman",..r.label.., ..p.label.., sep = "~`,`~")), method="spearman",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 70)+
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Pearson",..r.label.., ..p.label.., sep = "~`,`~")), method="pearson",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 65)+
facet_grid(.~name, scales = "free") +
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(120, 10, 120, 10),
panel.border = element_rect(fill = NA, color = "black"))
p
Here is how you could do it:
select all relevant columns
bring in long format
add color column to dataframe
make a list of dataframes with group_split
use a for loop to iterate over each of the 5 dataframes in the list
within the loop add stat_cor for pearson and spearman from ggpubr package
facet and do some formatting
library(tidyverse)
library(ggpubr)
df_long_list <- loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(
cols = -PdKeyT
) %>%
mutate(color = case_when(name=="BLUE" ~ "blue",
name=="GREEN" ~ "green",
name=="RED" ~ "red",
name=="SWIR1" ~ "magenta",
name=="SWIR2" ~ "violet"))%>%
group_split(name)
p <- ggplot()
for (i in 1:5) p <- p + geom_point(data=df_long_list[[i]], aes(value, PdKeyT, color=color))+
geom_smooth(data=df_long_list[[i]], aes(value, PdKeyT), method = lm, se=TRUE)+
theme(legend.position="none") +
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Spearman",..r.label.., ..p.label.., sep = "~`,`~")), method="spearman",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 70)+
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Pearson",..r.label.., ..p.label.., sep = "~`,`~")), method="pearson",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 65)+
facet_grid(.~name, scales = "free_y") +
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(120, 10, 120, 10),
panel.border = element_rect(fill = NA, color = "black"))
p
To panel plots use facet_wrap or facet_grid. Also, generally ggplot2 works better when your data are in a long format. This allows you to assign a variable to an aesthetic rather than do it manually as you have.
library(ggplot2)
library(tidyr)
library(purrr)
library(dplyr)
library(tibble)
# lengthen your data so variable names are in a column
df <- loopsubset_created %>%
pivot_longer(cols = c(BLUE:RED, starts_with("SWIR")))
# get correlation coef and pvalue
r <- map(split(df, ~ name), ~ with(.x, c(cor(PdKeyT, value, method = "spearman"),
cor.test(PdKeyT, value, method = "spearman")$p.value))) %>%
bind_rows() %>%
rownames_to_column("i") %>% # first row is coef, second row is p value
pivot_longer(-i) %>%
mutate(lab = ifelse(i == 1,
# formatted so will be parsed by geom_text
sprintf("italic(R) == %0.5f", value),
sprintf("italic(p) == %0.5f", value)),
x = -Inf, # left of panel
y = Inf, # top of panel,
vjust = ifelse(i == 1, 0.75, 2)) # put p-value below
df %>%
ggplot(aes(x = value, y = PdKeyT, color = name)) +
geom_point() +
geom_text(data = r,
aes(x = x, y = y,
label = lab,
vjust = vjust),
size = 3,
parse = T,
inherit.aes = F) +
geom_smooth(method = "lm",
se = T,
formula = y ~ x,
show.legend = F) +
facet_grid(~ name,
scales = "free_x") +
labs(color = element_blank(),
x = "XLAB")
I think this fulfills most of your requirements, other than the correlation annotations. If, as you mention in your question, you wish to have 3 regressions per panel (one for each of the three ranges of PdkeyT) you would also need 3 correlation coefficients and p values per panel, which will be messy.
The reason why you have not seen tutorials for having different facets per variable is that this is not what facets are. Facets are a way of displaying data that have the same x and y axis but differ by some other categorical variable. They are not intended as a way of plotting different x variables against the same y variable. What you are describing is 5 distinct plots side-by-side, not facets.
Having said that, it is still possible to create the plot you are looking for with creative use of facets. You first need to shape the data into long format so that the values of the different x axis columns get stacked into a single column called value, and a new column called name is created to label each value according to which column it originally came from.
We can then use the new value column as our x axis variable, and facet according to the name column.
To make this look more authentic, we make some theme adjustments to ensure the facet strips resemble axis labels:
library(dplyr)
library(tidyr)
library(ggplot2)
loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(-1) %>%
mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>%
ggplot(aes(value, PdKeyT, color = name)) +
geom_point(aes(alpha = range)) +
geom_smooth(aes(group = range), size = 0.1,
method = "lm", formula = y ~ x, color = "black") +
labs(x = "") +
facet_grid(.~name, switch = "x", scales = "free_x") +
scale_color_manual(values = c("blue", "green", "red", "magenta", "violet")) +
theme_bw() +
theme(strip.placement = "outside",
strip.background = element_blank(),
plot.margin = margin(120, 10, 120, 10),
legend.position = "none")

Replacing NAs in columns with values from rows in a different dataframe in R that have the same ID

I have two dataframes:
deploy.info <- data.frame(Echo_ID = c("20180918_7.5Fa_1", "20180918_Sebre_3", "20190808_Bake_2", "20190808_NH_2"),
uppermost_bin = c(2, 7, 8, 12))
spc <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, "NA", "NA", 1, "NA"),
'20180918_Sebre_3' = c(1, 2, "NA", "NA", "NA", 4, "NA"),
'20190808_Bake_2' = c(1, 3, 7, "NA", "NA", 6, "NA"),
'20190808_NH_2' = c(1, 2, 8, "NA", "NA", 6, "NA"))
The last four columns in the spc data frame refer to each Echo_ID that I am dealing with in the deploy.info data frame. I want to replace the NAs in the spc data frame with the uppermost_bin values for each of the Echo_IDs. Does anyone know how to go about doing this?
My desired end product would look like:
i.want.this <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, 2, 2, 1, 2),
'20180918_Sebre_3' = c(1, 2, 7, 7, 7, 4, 7),
'20190808_Bake_2' = c(1, 3, 7, 8, 8, 6, 8),
'20190808_NH_2' = c(1, 2, 8, 12, 12, 6, 12))
I have over 100 columns like this and would rather not go in and have to do this change by hand. Any ideas are greatly appreciated.
We can use Map to replace the NA elements in the columns of 'Echo_ID' by the corresponding values of 'uppermost_bin'. In the OP's dataset, the columns were factor, so it was converted to the correct type with type.convert
nm1 <- paste0("X", deploy.info$Echo_ID)
spc <- type.convert(spc, as.is = TRUE)
spc[nm1] <- Map(function(x, y) replace(x, is.na(x), y),
spc[nm1], deploy.info$uppermost_bin)
spc
# species percent_dist percent_dist_from_surf X20180918_7.5Fa_1 X20180918_Sebre_3 X20190808_Bake_2 X20190808_NH_2
#1 RS 0 0 1 1 1 1
#2 GS 25 25 1 2 3 2
#3 YG 80 80 1 7 7 8
#4 RR 100 100 2 7 8 12
#5 BR 98 98 2 7 8 12
#6 GT 60 60 1 4 6 6
#7 CB 100 100 2 7 8 12

Resources