Related
I am struggling to write an R code for the following problem:
df1 and df2 are two dataframes.
> df1 <- read.csv(file = 'Indx.csv')
> df1
St_Name I1 I2 I3 I4
1 TN 0.10 0.15 0.20 0.25
2 AZ 0.30 0.35 0.40 0.45
3 TX 0.50 0.55 0.60 0.65
4 KS 0.70 0.75 0.80 0.85
5 KY 0.90 0.95 0.11 0.12
6 MN 0.13 0.14 0.16 0.17
> df2 <- as.data.frame(fromJSON(file = "NewIndx.json"))
> df2
St_Name I1 I3
1 KS 100 200
# The output should be
> df1
St_Name I1 I2 I3 I4
1 TN 0.10 0.15 0.20 0.25
2 AZ 0.30 0.35 0.40 0.45
3 TX 0.50 0.55 0.60 0.65
4 KS 100 0.75 200 0.85
5 KY 0.90 0.95 0.11 0.12
6 MN 0.13 0.14 0.16 0.17
>
what is the optimal code to achieve this?
We could use this slightly modified function coalesce_join provided by Edward Visel:
library(tidyverse)
# the function:
coalesce_join <- function(x, y,
by = NULL, suffix = c(".y", ".x"),
join = dplyr::full_join, ...) {
joined <- join(y, x, by = by, suffix = suffix, ...)
# names of desired output
cols <- union(names(y), names(x))
to_coalesce <- names(joined)[!names(joined) %in% cols]
suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
# remove suffixes and deduplicate
to_coalesce <- unique(substr(
to_coalesce,
1,
nchar(to_coalesce) - nchar(suffix_used)
))
coalesced <- purrr::map_dfc(to_coalesce, ~dplyr::coalesce(
joined[[paste0(.x, suffix[1])]],
joined[[paste0(.x, suffix[2])]]
))
names(coalesced) <- to_coalesce
dplyr::bind_cols(joined, coalesced)[cols]
}
# apply
coalesce_join(df1, df2, by = 'St_Name')
St_Name I1 I3 I2 I4
1 KS 100.00 200.00 0.75 0.85
2 TN 0.10 0.20 0.15 0.25
3 AZ 0.30 0.40 0.35 0.45
4 TX 0.50 0.60 0.55 0.65
5 KY 0.90 0.11 0.95 0.12
6 MN 0.13 0.16 0.14 0.17
Kindly let me know if this is what you were anticipating.
library(tidyr)
id<- "St_Name"
df_1<- melt(df_1, id.vars = id, measure.vars = setdiff(colnames(df_1),id))
df_2 <- melt(df_2, id.vars = id, measure.vars = setdiff(colnames(df_2),id))
result <- merge(df_1,df_2, by=c("St_Name","variable"),no.dups = TRUE,all.x = TRUE)
result$value.x[which(!is.na(result$value.y))]<- result$value.y[which(!is.na(result$value.y))]
result <- result[,-4]
result <-spread(result, variable, value.x)
We could use {powerjoin} and use the conflict argument. coalesce_xy will pick in priority data from the right side table.
data
df1 <- tibble::tribble(
~St_Name, ~I1, ~I2, ~I3, ~I4,
"TN", 0.10, 0.15, 0.20, 0.25,
"AZ", 0.30, 0.35, 0.40, 0.45,
"TX", 0.50, 0.55, 0.60, 0.65,
"KS", 0.70, 0.75, 0.80, 0.85,
"KY", 0.90, 0.95, 0.11, 0.12,
"MN", 0.13, 0.14, 0.16, 0.17)
df2 <- tibble::tribble(
~St_Name, ~I1, ~I3,
"KS", 100, 200
)
solution
library(powerjoin)
power_left_join(df1, df2, by = "St_Name", conflict = coalesce_yx)
#> # A tibble: 6 × 5
#> St_Name I2 I4 I1 I3
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 TN 0.15 0.25 0.1 0.2
#> 2 AZ 0.35 0.45 0.3 0.4
#> 3 TX 0.55 0.65 0.5 0.6
#> 4 KS 0.75 0.85 100 200
#> 5 KY 0.95 0.12 0.9 0.11
#> 6 MN 0.14 0.17 0.13 0.16
I have this data frame (df) and I need to get a vector of the values in the column "amountN" when the "ID" column matches the values of a given vector (idvector), so I've created a for loop that I tested by printing the values of df$amountN to check they actually are accomplishing the df$ID condition, and they do. This is the for loop:
df
amountS amountN ID
64693 0.440 0.55 028
64702 0.360 0.52 028
64708 0.220 0.33 028
64714 0.500 0.27 028
64720 0.280 0.51 028
64726 0.520 0.47 028
64732 0.410 0.25 028
64735 0.090 0.11 028
64741 0.220 0.17 028
64750 0.630 0.48 028
64756 0.430 0.35 028
64762 1.200 0.40 028
65150 4.425 14.95 029
65156 5.035 23.60 029
65163 5.810 26.20 029
idvector <- c("010","025","028")
for(i in seq_len(nrow(df))){
for (j in seq_len(length(idvector))){
if(df$ID[i] == idvector[j]){
print(df$amountN[i])
}
}
}
So far, so good. However, I've tried many things to get those values into a vector by changing the function print() in the last line, but nothing seems to work:
for(i in seq_len(nrow(df))){
for (j in seq_len(length(idvector))){
if(df$ID[i] == idvector[j]){
x <- c(df$amountN[i])
}
x
}
}
I also tried to get the subset of df$amounN to then convert the data to vector using subset and select from dplyr library, but also don't get that, instead I get NULL:
for(i in seq_len(nrow(df))){
for (j in seq_len(length(idvector))){
if(df$ID[i] == idvector[j]){
x <- subset(df$amountN[i,])
}
x
}
}
I looked into the data and notice that the values look like this:
[1] 0.55
[1] 0.52
[1] 0.33
[1] 0.27
[1] 0.51
[1] 0.47
[1] 0.25
[1] 0.11
[1] 0.17
[1] 0.48
[1] 0.35
[1] 0.4
And they should look like this:
[1] 0.55 0.52 0.33 0.27 0.51 0.47 0.25 0.11 0.17 0.48
[11] 0.35 0.40
I really need a vector or a subset so I can apply summary statistics and others to the data, but have not figure it out.
I'm using R version 4.0.3
In base R, we can do this more easily with subset
subset(df, ID %in% idvector, select = amountN)$amountN
#[1] 0.55 0.52 0.33 0.27 0.51 0.47 0.25 0.11 0.17 0.48 0.35 0.40
Regarding the OP's code, we can fix by defining 'x' as a NULL vector and then concatenate 'x' within each loop, and assign it back to 'x'. Also, make sure that the 'idvector' is of the same type i.e. assuming it is numeric as well
x <- c()
for(i in seq_len(nrow(df))){
for (j in seq_len(length(idvector))){
if(df$ID[i] == idvector[j]){
x <- c(x, df$amountN[i])
}
x
}
}
-output
x
#[1] 0.55 0.52 0.33 0.27 0.51 0.47 0.25 0.11 0.17 0.48 0.35 0.40
NOTE: The issue in the OP's code was that 'x' was getting updated on each iteration while removing the previous output. It needs concatenation
data
df <- structure(list(amountS = c(0.44, 0.36, 0.22, 0.5, 0.28, 0.52,
0.41, 0.09, 0.22, 0.63, 0.43, 1.2, 4.425, 5.035, 5.81), amountN = c(0.55,
0.52, 0.33, 0.27, 0.51, 0.47, 0.25, 0.11, 0.17, 0.48, 0.35, 0.4,
14.95, 23.6, 26.2), ID = c(28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L)), class = "data.frame",
row.names = c("64693",
"64702", "64708", "64714", "64720", "64726", "64732", "64735",
"64741", "64750", "64756", "64762", "65150", "65156", "65163"
))
idvector <- c(10, 25, 28)
You could directly avoid the loop ans using %in% over df and indexing:
#Code
vec <- df$amountN[df$ID %in% idvector]
Output:
[1] 0.55 0.52 0.33 0.27 0.51 0.47 0.25 0.11 0.17 0.48 0.35 0.40
Edit: For anyone coming later: THIS IS NOT A DUPLICATE, since it explicitely concerns work on data frames, not single variables/vectors.
I have found several sites describing how to drop leading zeros in numbers or strings, including vectors. But none of the descriptions I found seem applicable to data frames.
Or the f_num function in the numform package. It treats "[a] vector of numbers (or string equivalents)", but does not seem to solve unwanted leading zeros in a data frame.
I am relatively new to R but understand that I could develop some (in my mind) complex code to drop leading zeros by subsetting vectors from a data frame and then combining those vectors into a full data frame. I would like to avoid that.
Here is a simple data frame:
df <- structure(list(est = c(0.05, -0.16, -0.02, 0, -0.11, 0.15, -0.26,
-0.23), low2.5 = c(0.01, -0.2, -0.05, -0.03, -0.2, 0.1, -0.3,
-0.28), up2.5 = c(0.09, -0.12, 0, 0.04, -0.01, 0.2, -0.22, -0.17
)), row.names = c(NA, 8L), class = "data.frame")
Which gives
df
est low2.5 up2.5
1 0.05 0.01 0.09
2 -0.16 -0.20 -0.12
3 -0.02 -0.05 0.00
4 0.00 -0.03 0.04
5 -0.11 -0.20 -0.01
6 0.15 0.10 0.20
7 -0.26 -0.30 -0.22
8 -0.23 -0.28 -0.17
I would want
est low2.5 up2.5
1 .05 .01 .09
2 -.16 -.20 -.12
3 -.02 -.05 .00
4 .00 -.03 .04
5 -.11 -.20 -.01
6 .15 .10 .20
7 -.26 -.30 -.22
8 -.23 -.28 -.17
Is that possible with relatively simple code for a whole data frame?
Edit: An incorrect link has been removed.
I am interpreting the intention of your question is to convert each numeric cell in the data.frame into a "pretty-printed" string which is possible using string substitution and a simple regular expression (a good question BTW since I do not know any method to configure the output of numeric data to suppress leading zeros without converting the numeric data into a string!):
df2 <- data.frame(lapply(df,
function(x) gsub("^0\\.", "\\.", gsub("^-0\\.", "-\\.", as.character(x)))),
stringsAsFactors = FALSE)
df2
# est low2.5 up2.5
# 1 .05 .01 .09
# 2 -.16 -.2 -.12
# 3 -.02 -.05 0
# 4 0 -.03 .04
# 5 -.11 -.2 -.01
# 6 .15 .1 .2
# 7 -.26 -.3 -.22
# 8 -.23 -.28 -.17
str(df2)
# 'data.frame': 8 obs. of 3 variables:
# $ est : chr ".05" "-.16" "-.02" "0" ...
# $ low2.5: chr ".01" "-.2" "-.05" "-.03" ...
# $ up2.5 : chr ".09" "-.12" "0" ".04" ...
If you want to get a fixed number of digits after the decimal point (as shown in the expected output but not asked for explicitly) you could use sprintf or format:
df3 <- data.frame(lapply(df, function(x) gsub("^0\\.", "\\.", gsub("^-0\\.", "-\\.", sprintf("%.2f", x)))), stringsAsFactors = FALSE)
df3
# est low2.5 up2.5
# 1 .05 .01 .09
# 2 -.16 -.20 -.12
# 3 -.02 -.05 .00
# 4 .00 -.03 .04
# 5 -.11 -.20 -.01
# 6 .15 .10 .20
# 7 -.26 -.30 -.22
# 8 -.23 -.28 -.17
Note: This solution is not robust against different decimal point character (different locales) - it always expects a decimal point...
I have a dataframe called "new_dat" containing the time (days) in column t, and temperature data (and occaisionally NA) in columns A - C (please see the example in the code below):
> new_dat
t A B C
1 0.00 0.82 0.88 0.46
2 0.01 0.87 0.94 0.52
3 0.02 NA NA NA
4 0.03 0.95 1.03 0.62
5 0.04 0.98 1.06 0.67
6 0.05 1.01 1.09 0.71
7 0.06 2.00 1.13 2.00
8 0.07 1.06 1.16 0.78
9 0.08 1.07 1.18 0.81
10 0.09 1.09 1.20 0.84
11 0.10 1.10 1.21 0.86
12 0.11 2.00 1.22 0.87
Here is a dput() of the dataframe:
structure(list(t = c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07,
0.08, 0.09, 0.1, 0.11), A = c(0.82, 0.870000000000001, NA,
0.949999999999999,
0.979999999999997, 1.01, 2, 1.06, 1.07, 1.09, 1.1, 2), B =
c(0.879999999999999,
0.940000000000001, NA, 1.03, 1.06, 1.09, 1.13, 1.16, 1.18, 1.2,
1.21, 1.22), C = c(0.460000000000001, 0.520000000000003, NA,
0.619999999999997, 0.669999999999998, 0.709999999999997, 2,
0.780000000000001,
0.809999999999999, 0.84, 0.859999999999999, 0.87)), .Names = c("t",
"A", "B", "C"), row.names = c(NA, 12L), class = "data.frame")
As output, I want a vector (list?) of the values of column t where the temperature reading from columns A-C >= 2 for the first time (and only the first time), or - if the temperature is never >= 2 - return the last time reading in column t (0.11 in my example). So 'A' would return the value 0.06 (and not 0.11), 'B' would have the value 0.11 and 'C' 0.06. I intended to use the vector generated to create a new dataframe something like this:
A B C
0.06 0.11 0.06
I'm inexperienced with R (and code in general) so, despite reading that looping can be ineficient (but not really understanding how to accomplish what i want without it), I tried to solve this by looping first by column and then by row as follows:
#create blank vector to add my results to
aer <- c()
#loop by column, then by row, adding values according to the if statement
for (c in 2:ncol(new_dat)){
c <- c
for (r in 1:nrow(new_dat)){
r <- r
if ((!is.na(new_dat[r,c] )) & (new_dat[r,c] >= 2)){
aer <- c(aer, new_dat$t[r])
}
}
}
This returns my vector, aer, as:
> aer
[1] 0.06 0.11 0.06
So it's returning both instances where 'A' is 2, and the one from column 'C'.
I dont know how to instruct the loop to stop and move to the next column after finding one instance where my 'if' statement is true. I also tried adding an 'else' to cover the situation where temperature doesnt exceed 2:
else {
aer <- c(aer, new_dat$t[nrow(new_dat)])
But this did not work.
I would appreciate any help in completing the code, or suggestions for a better solution.
library(tidyverse)
new_dat %>%
gather(col, temp, -t) %>% # reshape data
na.omit() %>% # remove rows with NAs
group_by(col) %>% # for each column value
summarise(v = ifelse(is.na(first(t[temp >= 2])), last(t), first(t[temp >= 2]))) %>% # return the last t value if there are no temp >=2 otherwise return the first t with temp >= 2
spread(col, v) # reshape again
# # A tibble: 1 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 0.06 0.11 0.06
This solution will create the dataframe for you automatically, instead of returning a vector for you to create the dataframe yourself.
Here is a two steps solution.
First get an index vector of the values you want, then use that index vector to subset the dataframe.
inx <- sapply(new_dat[-1], function(x) {
w <- which(x >= 2)
if(length(w)) min(w) else NROW(x)
})
new_dat[inx, 1]
#[1] 0.06 0.11 0.06
I have searched a lot for this simple question, but have not found a solution. It looks really simple. I have a dataframe with a column like this:
Value
0.13
0.35
0.62
0.97
0.24
0.59
0.92
0.16
0.29
0.62
0.98
All values have a range between 0 and 1. What I want is that when the value starts to drop, I assign a new group to it. Within each group, the value is increasing. So the ideal outcome will look like this:
Value Group
0.13 1
0.35 1
0.62 1
0.97 1
0.24 2
0.59 2
0.92 2
0.16 3
0.29 3
0.62 3
0.98 3
Does anyone have a suggestion for how to address this?
This should do the trick, and uses only vectorised base functions. You may want to exchange the < for <=, if thats the behaviour you wanted.
vec <- c(0.13, 0.35, 0.62, 0.97, 0.24, 0.59, 0.92, 0.16, 0.29, 0.62, 0.98)
cumsum(c(1, diff(vec) < 0))
This isn't the most elegant solution, but it works:
value <- c(0.13, 0.35, 0.62, 0.97, 0.24, 0.59, 0.92, 0.16, 0.29, 0.62, 0.98)
foo <- data.frame(value, group = 1)
current_group <- 1
for(i in 2:nrow(foo)){
if(foo$value[i] >= foo$value[i-1]){
foo$group[i] <- current_group
}else{
current_group <- current_group + 1
foo$group[i] <- current_group
}
}
df <- data.frame( x = c(0.13, 0.35, 0.62, 0.97, 0.24, 0.59, 0.92, 0.16, 0.29, 0.62, 0.98))
df$y <- c(df$x[-1], NA) # lag column
df$chgdir <- as.numeric(df$y - df$x < 0) # test for change in direction
df$chgdir[is.na(df$chgdir)] <- 0 # deal with NA
df$group <- cumsum(df$chgdir) + 1 # determine group number
df[,c("x", "group")]
#> x group
#> 1 0.13 1
#> 2 0.35 1
#> 3 0.62 1
#> 4 0.97 2
#> 5 0.24 2
#> 6 0.59 2
#> 7 0.92 3
#> 8 0.16 3
#> 9 0.29 3
#> 10 0.62 3
#> 11 0.98 3