Expand rows based on arbitrary N and M - r

Let's say I have a parameter V which is equal to 100 on row v. I have another parameter D on row d that is equal to 100 too. I would like to create a dataframe with N rows above row v and M rows below row d, so that v-Nth row is equal to V + Nf and d+Mth row is equal to D - Mf. For this example, let's assume that f is equal to 5:
Input:
> dput(df)
structure(list(rw = structure(2:1, .Label = c("d", "v"), class = "factor"),
vals = c(100, 100)), class = "data.frame", row.names = c(NA,
-2L))
Expected output:
> dput(df)
structure(list(rw = structure(c(8L, 7L, 6L, 5L, 1L, 2L, 3L, 4L
), .Label = c("d", "d+1", "d+2", "d+M", "v", "v-1", "v-2", "v-N"
), class = "factor"), vals = c(85, 90, 95, 100, 100, 105, 110,
115)), class = "data.frame", row.names = c(NA, -8L))
How can I achieve this? Let me know if you have any questions.
Thanks!

We can use Map with seq to create a sequence of values by looping over the rows of 'vals' column, then pass another variable byval with sign for getting the sequence backwards or forwards for each corresponding element of 'vals' and pass it on the by argument in seq while setting the constant length of sequence to output as 4 ('n'). Then, set the names of the list wiith the 'rw' column and convert the list to a two column data.frame with stack
byval <- 5
n <- 4
stack(setNames(Map(function(x, y) sort(seq(x, length.out = n, by = y)),
df$vals, c(-byval, byval)), df$rw))[2:1]
If we need different length.out, pass it as another parameter
n1 <- 6
n2 <- 5
stack(setNames(Map(function(x, y, z) sort(seq(x, length.out = z, by = y)),
df$vals, c(-byval, byval), c(n1, n2)), df$rw))[2:1]

Related

Selecting columns where only one item is true

I am using the following code to determine if any of the columns in my data table have 1065. If any of the columns do have 1065, I get "TRUE" which works perfectly. Now I want to only output true if any of the columns notcancer0:notcancer33 contains 1065 AND all the rest are NA. Other columns may contain other values like 1064, 1066, etc. But I want to output "TRUE" for the rows where there is only 1065 and all the rest of the columns contain NAs for that row. What is the best way to do this?
biobank_nsaid[, ischemia1 := Reduce(`|`, lapply(.SD, `==`, "1065")), .SDcols=notcancer0:notcancer33]
Sample data:
biobank_nsaid = structure(list(aspirin = structure(c(2L, 1L, 1L, 1L), .Label =
c("FALSE", "TRUE"), class = "factor"), aspirinonly = c(TRUE, FALSE, FALSE,
FALSE), med0 = c(1140922174L, 1140871050L, 1140879616L, 1140909674L ), med1 =
c(1140868226L, 1140876592L, 1140869180L, NA), med2 = c(1140879464L, NA,
1140865016L, NA), med3 = c(1140879428L, NA, NA, NA)), row.names = c(NA, -4L),
class = c("data.table", "data.frame"))
Here are 2 options:
setDT(biobank_nsaid)[, ischemia1 :=
rowSums(is.na(.SD))==ncol(.SD)-1L & rowSums(.SD==1140909674, na.rm=TRUE)==1L,
.SDcols=med0:med3]
Or after some boolean manipulations:
biobank_nsaid[, ic2 :=
!(rowSums(is.na(.SD))!=ncol(.SD)-1L | rowSums(.SD==1140909674, na.rm=TRUE)!=1L),
.SDcols=med0:med3]

Return row(i) if two columns match

I have two datasets:
df1
ID paddock cow ID
90/123 10 09/123
90/124 11 09/124
90/125 11 09/124
df2
ID paddock
09/123 20
09/124 21
I would like to match df1$cowID with df2$ID and return df2$paddock for whatever row matches. My current code is as follows:
dt <- ifelse(df1$cowID %in% df2$ID, df2$paddock[i], NA)
But I'm getting a return error. Could someone direct me in the right direction please? Thanks in advance!
You might consider joining the datasets.
dplyr::left_join(df1, df2, by = c('cow ID', 'ID')
You should probably use match :
df1$df2_paddock <- df2$paddock[match(df1$cow_ID, df2$ID)]
df1
# ID paddock cow_ID df2_paddock
#1 90/123 10 09/123 20
#2 90/124 11 09/124 21
data
df1 <- structure(list(ID = structure(1:2, .Label = c("90/123", "90/124"
), class = "factor"), paddock = 10:11, cow_ID = structure(1:2, .Label = c("09/123",
"09/124"), class = "factor")), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = 20:21), class = "data.frame",
row.names = c(NA, -2L))
You can do that by joining the two dataframes and getting the column that you want.
Using Base R
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
merge(df1, df2, by = c("ID"), suffixes = c(".x", ".y"))["paddock.y"]
# paddock.y
# 20
# 21
Using Dplyr
library(dplyr)
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
df1 %>%
inner_join(df2, by = c("ID"), suffixes = c(".x", ".y")) %>%
select(paddock.y) %>%
rename(paddock = paddock.y)
# paddock
# 20
# 21
If you would like to use ifelse(), maybe you can use the following code to make it
with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
such that
> with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
[1] 20 21
DATA
df1 <- structure(list(ID = structure(1:3, .Label = c("90/123", "90/124",
"90/125"), class = "factor"), paddock = c(10, 11, 11), cow_ID = structure(c(1L,
2L, 2L), .Label = c("09/123", "09/124"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = c(20, 21)), class = "data.frame", row.names = c(NA,
-2L))

Control number of rows when binding dataframes with different number of rows?

I have a dataframe generated by a function:
Each time it's of different number of rows:
structure(list(a = c(1, 2, 3), b = c("er", "gd", "ku"), c = c(43,
453, 12)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
structure(list(a = c(1, 2), b = c("er", "gd"), c = c(43, 453)), .Names = c("a",
"b", "c"), row.names = c(NA, -2L), class = c("tbl_df", "tbl",
"data.frame"))
I want to be able like in a while loop to control the number of rows to be less then n (n = 4, 100, 4242...) when I bind rows.
Please advise how to do this using functional programming without a while loop?
I mean sometimes you will get n = 10 and the df before bind_rows is 7 and after binding the last one it will be 20. It's ok, I want the number of rows to be min_k (k >= n)
Here is my while loop doing this:
b <- list()
total_rows <- 0
while(total_rows < 1000) {
df <- f_produce_rand_df()
b[[length(b) + 1]] <- df
total_rows <- total_rows + nrow(df)
}

Improvement in for loop using other method

Problem
There is 1 main station (df) and 3 local stations (s) stacked in a single data.frame with values for three days. The idea is to take each day from the main station, find the relative anomaly of the three local stations, and smooth it using inverse distance weighting (IDW) from the phylin package. This is then applied to the value in the main station by multiplication.
Any suggestions on improving this code (e.g. data.table, dplyr, apply)? I still don't know how to approach this problem without the cumbersome for loop.
dput
s <- structure(list(id = c("USC00031152", "USC00034638", "USC00036352",
"USC00031152", "USC00034638", "USC00036352", "USC00031152", "USC00034638",
"USC00036352"), lat = c(33.59, 34.7392, 35.2833, 33.59, 34.7392,
35.2833, 33.59, 34.7392, 35.2833), long = c(-92.8236, -90.7664,
-93.1, -92.8236, -90.7664, -93.1, -92.8236, -90.7664, -93.1),
year = c(1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900,
1900), month = c(1, 1, 1, 1, 1, 1, 1, 1, 1), day = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), value = c(63.3157576809045,
86.0490598902219, 76.506386949066, 71.3760752788486, 89.9119576975542,
76.3535163951321, 53.7259645981243, 61.7989638892985, 85.8911224149051
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-9L), .Names = c("id", "lat", "long", "year", "month", "day",
"value"))
df <- structure(list(id = c(12345, 12345, 12345), lat = c(100, 100,
100), long = c(50, 50, 50), year = c(1900, 1900, 1900), month = c(1,
1, 1), day = 1:3, value = c(54.8780020601509, 106.966029162171,
98.3198828955801)), row.names = c(NA, -3L), class = "data.frame", .Names = c("id",
"lat", "long", "year", "month", "day", "value"))
Code
library(phylin)
nearest <- function(i, loc){
# Stack 3 local stations
stack <- s[loc:(loc+2),]
# Get 1 main station
station <- df[i,]
# Check for NA and build relative anomaly (r)
stack <- stack[!is.na(stack$value),]
stack$r <- stack$value/station$value
# Use IDW and return v
v <- as.numeric(ifelse(dim(stack)[1] == 1,
stack$r,
idw(stack$r, stack[,c(2,3,8)], station[,2:3])))
return(v)
}
ncdc <- 1
for (i in 1:nrow(df)){
# Get relative anomaly from function
r <- nearest(i, ncdc)
# Get value from main station and apply anomaly
p <- df[i,7]
df[i,7] <- p*r
# Iterate to next 3 local stations
ncdc <- ncdc + 3
}
Suppose you let your nearest function unchanged.
Then you could get your new value column in df by
newvalue <- sapply(1:NROW(df), function (i) df[i,7] * nearest(i, 3*(i-1)+1))
df$value <- newvalue

Function defining answer by a vector

Looking to learn function writing. I have data laid out in the following (e.g.):
Genus Species Wing Tail
A X 10.5 20.3
A Y 10.7 20.7
B XX 15.2 22.5
B XY 15.5 24
I calculate variance for a given trait using the equation:
sqrt(max(Wing) - min (Wing))
which I sum for all traits.
So I can write the following function so sum variance for the total data set:
variance<- function(data){
t <- sqrt(max(Tail)-min(Tail))
w <- sqrt(max(Wing)-min(Wing))
x <- sum(t,w)
x
}
But I can'twork out how to generate a response to give me an output where this result is dependant on the Genus. So i'm looking to generate an output like:
Genus A Genus B
2.345 3.456
I am going to give a new name to your function because it's just wrong to call it "variance". I hope you can overlook that. We can work on a dataframe object
dput(dfrm)
structure(list(Genus = structure(c(1L, 1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), Species = structure(c(1L, 4L, 2L, 3L
), .Label = c("X", "XX", "XY", "Y"), class = "factor"), Wing = c(10.5,
10.7, 15.2, 15.5), Tail = c(20.3, 20.7, 22.5, 24)), .Names = c("Genus",
"Species", "Wing", "Tail"), class = "data.frame", row.names = c(NA,
-4L))
dev2<- function(df){
t <- sqrt(max(df[["Tail"]])-min(df[["Tail"]]))
w <- sqrt(max(df[["Wing"]])-min(df[["Wing"]]))
x <- sum(t,w)
x
}
Now use it to work on the full dataframe, using the split-lapply strategy, which passes sections of the original dataframe determined by the Genus values to the dev2 function
lapply( split(dfrm, list(dfrm$Genus)), FUN = dev2)
$A
[1] 1.079669
$B
[1] 1.772467

Resources