Replacing values based on conditions - r

I have a dataframe; one of the columns is `Id`, and some of the values have been messed up during the recording of the data.
here's an example of the type of data
# Example data from the question, assigned to `df` so the cleanup
# snippets below can actually reference it (the original transcript
# showed the bare dput() output without an assignment).
df <- structure(list(Id = c("'110171786'", "'1103fbfd5'", "'0700edf6dc'",
"'1103fad09'", "'01103fc9bb'", "''", "''", "0000fba2b'", "'01103fb169'",
"'01103fd723'", "'01103f9c34'", "''", "''", "''", "'01103fc088'",
"'01103fa6d8'", "'01103fb374'", "'01103fce8c'", "'01103f955d'",
"'011016e633'", "'01103fa0da'", "''", "''", "''", "'01103fa4bd'",
"'01103fb5c4'", "'01103fd0d7'", "'01103f9e2e'", "'01103fc657'",
"'01103fd4d1'", "'011016e78e'", "'01103fbda2'", "'01103fbae7'",
"'011016ee23'", "'01103fc847'", "'01103fbfbb'", "''", "'01103fb8bb'",
"'01103fc853'", "''", "'01103fbcd5'", "'011016e690'", "'01103fb253'",
"'01103fcb19'", "'01103fb446'", "'01103fa4fa'", "'011016cfbd'",
"'01103fd250'", "'01103fac7d'", "'011016a86e'"), Weight = c(11.5,
11.3, 11.3, 10.6, 10.6, 8.9, 18.7, 10.9, 11.3, 18.9, 18.9, 8.6,
8.8, 8.4, 11, 10.4, 10.4, 10.8, 11.2, 11, 10.3, 9.5, 8.1, 9.3,
10.2, 10.5, 11.2, 21.9, 18, 17.8, 11.3, 11.5, 10.8, 10.5, 12.8,
10.9, 8.9, 10.3, 10.8, 8.9, 10.9, 9.9, 19, 11.6, 11.3, 11.7,
10.9, 12.1, 11.3, 10.6)), class = "data.frame", row.names = c(NA,
-50L))
>
What I would like to do is search through the id column and replace the following mistakes
Some of the values have a zero missing off the front; all of these now start with a 1 instead, which makes finding them easy. So basically anything that has a character length of 9 and starts with a 1 needs a 0 added as the first character.
some of the values are less than 10 characters long, these need to be removed.
some have more than one leading 0 and these need to be removed.

# Ids that lost their leading zero are exactly 10 characters between the
# quotes' bounds: optional leading ', then "1", eight more characters, and
# a closing '. Prepend the missing 0 just after the (optional) quote.
needs_zero <- grepl("^'?1.{8}'$", df$Id)
df$Id[needs_zero] <- sub("^('?)", "\\10", df$Id[needs_zero])
# Keep only rows whose id is neither empty ('') nor double-zero-prefixed.
subset(df, !grepl("^'?(00|'$)", Id))
# Id Weight
# 1 '0110171786' 11.5
# 2 '01103fbfd5' 11.3
# 3 '0700edf6dc' 11.3
# 4 '01103fad09' 10.6
# 5 '01103fc9bb' 10.6
# 9 '01103fb169' 11.3
# 10 '01103fd723' 18.9
# 11 '01103f9c34' 18.9
# 15 '01103fc088' 11.0
# 16 '01103fa6d8' 10.4
# 17 '01103fb374' 10.4
# 18 '01103fce8c' 10.8
# 19 '01103f955d' 11.2
# 20 '011016e633' 11.0
# 21 '01103fa0da' 10.3
# 25 '01103fa4bd' 10.2
# 26 '01103fb5c4' 10.5
# 27 '01103fd0d7' 11.2
# 28 '01103f9e2e' 21.9
# 29 '01103fc657' 18.0
# 30 '01103fd4d1' 17.8
# 31 '011016e78e' 11.3
# 32 '01103fbda2' 11.5
# 33 '01103fbae7' 10.8
# 34 '011016ee23' 10.5
# 35 '01103fc847' 12.8
# 36 '01103fbfbb' 10.9
# 38 '01103fb8bb' 10.3
# 39 '01103fc853' 10.8
# 41 '01103fbcd5' 10.9
# 42 '011016e690' 9.9
# 43 '01103fb253' 19.0
# 44 '01103fcb19' 11.6
# 45 '01103fb446' 11.3
# 46 '01103fa4fa' 11.7
# 47 '011016cfbd' 10.9
# 48 '01103fd250' 12.1
# 49 '01103fac7d' 11.3
# 50 '011016a86e' 10.6

Related

Find the relative position of a specific number in an increasing vector

I have a simple question. I have a numeric vector with increasing order.
e.g.
[1] 13.5 13.9 14.2 14.5 14.8 15.2 16.0 16.9 17.4 17.8 18.3 18.7 19.4
and I want to find the relative position of a specific number between these vectors.
e.g.
f(13)
' < 13.5 '
f(15.0)
' 14.8 <= x < 15.2 '
You can use cut():
# Breakpoints in increasing order; capping with -Inf/Inf guarantees that
# every query value falls into some interval.
vec <- c(13.5, 13.9, 14.2, 14.5, 14.8, 15.2, 16.0, 16.9, 17.4, 17.8, 18.3, 18.7, 19.4)
breaks <- c(-Inf, vec, Inf)
# cut() reports the half-open interval (a, b] containing each value.
cut(13, breaks)
# [1] (-Inf,13.5]
cut(15, breaks)
# [1] (14.8,15.2]

Finding distance between a row and the row two above it in R

I would like to efficiently compute distances between every row in a matrix and the row two rows above it in R...
My attempts at finding a dplyr rowwise solution with lag(., n = 2) have failed, and I'm sure there's a better solution than this for loop.
Thoughts are much appreciated!
library(rdist)
library(tidyverse)
# Example data from the question, assigned to `input_labs` so the loop
# below (and the answers further down) can reference it.
input_labs <- structure(list(sodium = c(140, 152.6, 138, 152.4, 140, 152.6,
141, 152.7, 141, 152.7), chloride = c(103, 148.9, 104, 149, 102,
148.8, 103, 148.9, 104, 149), potassium_plas = c(3.4, 0.34, 4.1,
0.41, 3.7, 0.37, 4, 0.4, 3.7, 0.37), co2_totl = c(31, 3.1, 22,
2.2, 23, 2.3, 27, 2.7, 20, 2), bun = c(11, 1.1, 5, 0.5, 8, 0.8,
21, 2.1, 10, 1), creatinine = c(0.84, 0.084, 0.53, 0.053, 0.69,
0.069, 1.04, 0.104, 1.86, 0.186), calcium = c(9.3, 0.93, 9.8,
0.98, 9.4, 0.94, 9.4, 0.94, 9.1, 0.91), glucose = c(102, 10.2,
99, 9.9, 115, 11.5, 94, 9.4, 122, 12.2), anion_gap = c(6, 0.599999999999989,
12, 1.20000000000001, 15, 1.50000000000001, 11, 1.09999999999998,
17, 1.69999999999999)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
# Preallocate the result vector. NOTE: rep() has no `n` argument -- the
# original rep(NA, n = nrow(input_labs)) silently ignored `n` and
# produced a length-1 vector that the loop then grew element by element.
dist_prior <- rep(NA_real_, nrow(input_labs))
# Distance between each row and the row two above it; rows 1-2 stay NA.
for (i in 3:nrow(input_labs)) {
  dist_prior[i] <- cdist(input_labs[i, ], input_labs[i - 2, ])
}
We could loop over the sequence of rows in map and apply the function, append NAs at the beginning to make the length correct
library(dplyr)
library(rdist)
library(purrr)
# Tidyverse answer: keep the first two rows NA, then for each row i >= 3
# compute the distance to row i - 2 with rdist::cdist().
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1 in favour of
# pick() -- confirm the installed dplyr version before reusing this.
input_labs %>%
mutate(dist_prior = c(NA_real_, NA_real_,
map_dbl(3:n(), ~ cdist(cur_data()[.x,], cur_data()[.x-2, ]))))
-output
# A tibble: 10 × 10
sodium chloride potassium_plas co2_totl bun creatinine calcium glucose anion_gap dist_prior
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 140 103 3.4 31 11 0.84 9.3 102 6 NA
2 153. 149. 0.34 3.1 1.1 0.084 0.93 10.2 0.600 NA
3 138 104 4.1 22 5 0.53 9.8 99 12 13.0
4 152. 149 0.41 2.2 0.5 0.053 0.98 9.9 1.20 1.30
5 140 102 3.7 23 8 0.69 9.4 115 15 16.8
6 153. 149. 0.37 2.3 0.8 0.069 0.94 11.5 1.50 1.68
7 141 103 4 27 21 1.04 9.4 94 11 25.4
8 153. 149. 0.4 2.7 2.1 0.104 0.94 9.4 1.10 2.54
9 141 104 3.7 20 10 1.86 9.1 122 17 31.5
10 153. 149 0.37 2 1 0.186 0.91 12.2 1.70 3.15
Or we may split by row on the original data and the lagged one, and use map2 to loop over the two lists in parallel and apply the function
# Row-split variant: pair each row with the row two positions above it
# and loop over the two lists in parallel with map2_dbl().
# NOTE(review): this relies on lag() shifting the data frame by two rows
# and on asplit() accepting that result -- asplit() is documented for
# matrices/arrays, so confirm this runs on the intended dplyr version.
input_labs$dist_prior <- map2_dbl(
asplit(lag(input_labs, n = 2), 1),
asplit(input_labs, 1),
~ cdist(as.data.frame.list(.x), as.data.frame.list(.y))[,1])
in Base R you can use diff and rowSums as shown below:
# Base R: diff(m, 2) is row i+2 minus row i, so the row-wise Euclidean
# distances are sqrt(rowSums(d^2)); two leading NAs pad the result back
# to nrow(input_labs). The console output line is commented out so the
# snippet can be sourced as-is.
c(NA, NA, sqrt(rowSums(diff(as.matrix(input_labs), 2)^2)))
# [1] NA NA 12.955157 1.295516 16.832873 1.683287 25.381342 2.538134 31.493688 3.149369
You can cbind the results to the original dataframe.

Removing blank rows in the two data frames and combine them

I have a dataset which contains two groups. First 3 columns are 1st group and next 3 columns are 2nd group. They contains missing values at random manner.
I have to delete the rows containing complete missing values in any one group. And both group contains at least one value in the row.
At last I have to combine both the groups.
I have tried many R codes. Please suggest some useful R function for this issue.
example data structure
If your data is properly named, this can be done using starts_with and if_any (otherwise you might rename your columns first as you see fit)
library(tidyverse)
# Example data: columns x1-x3 form group 1, columns y1-y3 form group 2.
df <- tribble(
  ~x1, ~x2, ~x3, ~y1, ~y2, ~y3,
  26.4, 26.5, 26.6, 26.7, 26.4, 26.5,
  NA, NA, NA, 23.7, NA, NA,
  27.2, 28.0, 27.9, 27.6, 27.8, 27.7,
  NA, 24.2, 24.9, 23.9, 24.9, 24.0,
  24.3, NA, 24.3, 24.0, 24.1, 24.5,
  26.9, 26.7, 27.0, 26.9, 26.8, 26.8,
  24.4, 24.4, 24.5, 24.8, 24.3, 24.3,
  NA, NA, NA, 23.9, NA, NA,
  NA, NA, NA, 23.9, NA, NA,
  24.9, NA, NA, 24.9, NA, NA,
  NA, NA, NA, 24.5, NA, NA,
  28.3, 28.2, 28.3, 28.2, 28.4, 28.3,
  28.3, 28.4, 28.1, 28.3, 28.3, 28.2
)
# A row survives only if each group contributes at least one non-NA value:
# first drop rows where every x column is NA, then rows where every y
# column is NA.
df %>%
  filter(!if_all(starts_with("x"), is.na)) %>%
  filter(!if_all(starts_with("y"), is.na))
#> # A tibble: 9 × 6
#> x1 x2 x3 y1 y2 y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 26.4 26.5 26.6 26.7 26.4 26.5
#> 2 27.2 28 27.9 27.6 27.8 27.7
#> 3 NA 24.2 24.9 23.9 24.9 24
#> 4 24.3 NA 24.3 24 24.1 24.5
#> 5 26.9 26.7 27 26.9 26.8 26.8
#> 6 24.4 24.4 24.5 24.8 24.3 24.3
#> 7 24.9 NA NA 24.9 NA NA
#> 8 28.3 28.2 28.3 28.2 28.4 28.3
#> 9 28.3 28.4 28.1 28.3 28.3 28.2
Created on 2022-06-18 by the reprex package (v2.0.1)
I am not sure I understand your question, but here is a demonstration using dplyr::if_all() , dplyr::if_any()
library(tidyverse)
# Example data
# have to delete the rows containing complete missing values in any one group.
# And both group contains at least one value in the row. (Not sure what that means)
# At last I have to combine both the groups.
# Demo data: gr1_* columns are group 1, gr2_* columns are group 2.
d <- tibble::tribble(
~gr1_col1, ~gr1_col2, ~gr1_col3, ~gr2_col1, ~gr2_col2, ~gr2_col3,
1, 2, NA, 1, 1, 1,
NA, NA, NA, NA, 1, 1,
NA, 1, 1, NA, NA, 1,
1, NA, 2, NA, NA, NA,
)
d
#> # A tibble: 4 x 6
#> gr1_col1 gr1_col2 gr1_col3 gr2_col1 gr2_col2 gr2_col3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 NA 1 1 1
#> 2 NA NA NA NA 1 1
#> 3 NA 1 1 NA NA 1
#> 4 1 NA 2 NA NA NA
# Keep only rows where neither group is entirely missing.
d %>%
dplyr::filter(
# First group
!dplyr::if_all(.cols = c(1, 2, 3), .fns = is.na), # removing rows if all columns 1, 2 and 3 are NA
# Second group
!if_all(.cols = c(4, 5, 6), .fns = is.na) # removing rows if all columns 4, 5 and 6 are NA
)
#> # A tibble: 2 x 6
#> gr1_col1 gr1_col2 gr1_col3 gr2_col1 gr2_col2 gr2_col3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 NA 1 1 1
#> 2 NA 1 1 NA NA 1
# Not sure what you mean with how you want to combine groups
Created on 2022-06-17 by the reprex package (v2.0.1)

Summation series in R

I am currently trying to simplify this summation. I am new to R.
Data
Lx = c(5050.0, 65.0, 25.0, 19.0, 17.5, 16.5, 15.5, 14.5, 13.5, 12.5, 6.0, 0.0)
Summation series
# Tx[i] is the tail sum of Lx from position i through 12. One vapply()
# call replaces the twelve hand-written sum() terms while computing the
# exact same values in the same order.
Tx <- vapply(1:12, function(i) sum(Lx[i:12]), numeric(1))
You can do:
rev(cumsum(rev(Lx)))
# [1] 5255.0 205.0 140.0 115.0 96.0 78.5 62.0 46.5 32.0 18.5 6.0 0.0
Or alternatively, using Reduce():
Reduce(`+`, Lx, right = TRUE, accumulate = TRUE)
# [1] 5255.0 205.0 140.0 115.0 96.0 78.5 62.0 46.5 32.0 18.5 6.0 0.0
Using a for loop:
# Tail sums via an explicit loop: Tx_new[i] == sum(Lx[i:length(Lx)]).
# numeric() preallocates the correct type up front (vector() defaults to
# a logical vector, which the first assignment silently coerced), and
# seq_along() is safe for empty input where 1:length(Lx) would yield 1:0.
Tx_new <- numeric(length(Lx))
for (i in seq_along(Lx)) {
  Tx_new[i] <- sum(Lx[i:length(Lx)])
}
A possible solution, using sapply:
sapply(1:12, function(x) sum(Lx[x:12]))
#> [1] 5255.0 205.0 140.0 115.0 96.0 78.5 62.0 46.5 32.0 18.5
#> [11] 6.0 0.0
Package spatstat.utils provides a fast version ("under certain conditions") of the reverse cumulative sum, revcumsum, which is based on computing sum(x[i:n]) with n = length(x) (basically @Jan Brederecke's answer):
# Fast reverse cumulative sum from spatstat.utils, equivalent to
# rev(cumsum(rev(Lx))).
Lx <- c(5050.0, 65.0, 25.0, 19.0, 17.5, 16.5, 15.5, 14.5, 13.5, 12.5, 6.0, 0.0)
# install.packages("spatstat.utils")
spatstat.utils::revcumsum(Lx)
# [1] 5255.0 205.0 140.0 115.0 96.0 78.5 62.0 46.5 32.0 18.5 6.0 0.0
Benchmark
# Benchmark of the answers above.
# NOTE(review): fRev()/fReduce()/fJan()/fEshita()/fsapply()/fRevcumsum()
# are wrapper functions around each answer and are not shown in this
# excerpt; microbenchmark() and autoplot() additionally require
# library(microbenchmark) and library(ggplot2) to be attached -- confirm
# before running.
x = c(5050.0, 65.0, 25.0, 19.0, 17.5, 16.5, 15.5, 14.5, 13.5, 12.5, 6.0, 0.0)
bm <- microbenchmark(
fRev(x),
fReduce(x),
fJan(x),
fEshita(x),
fsapply(x),
fRevcumsum(x),
times = 100L
)
autoplot(bm)
rev(cumsum(rev(Lx))) and spatstat.utils::revcumsum(Lx) seem like the fastest solutions.
Please try the following code:
# Reverse cumulative sums collected in the list `aa`:
# aa[[i]] == sum(Lx[i:length(Lx)]).
Lx <- c(5050.0, 65.0, 25.0, 19.0, 17.5, 16.5, 15.5, 14.5, 13.5, 12.5, 6.0, 0.0)
n <- length(Lx)
# Preallocate the list instead of growing it with append() inside the
# loop, which copies the whole list on every iteration (O(n^2)).
aa <- vector("list", n)
for (i in seq_len(n)) {
  aa[[i]] <- sum(Lx[i:n])
}
all the values after summation will be in the list "aa".

R programming help in editing code

I've asked many questions about this and all the answers were really helpful...but once again my data is weird and I need help...Basically, what I want to do is find the average speed at a certain range of intervals...lets say from 6 s to 40 s my average speed would be 5 m/s...etc etc..
So it was pointed out to me to use this code...
library(IRanges)
# Odd columns of `data` are the Time columns and each is paired with the
# Speed column immediately to its right, so idx walks the Time columns.
idx <- seq(1, ncol(data), by=2)
# idx is now 1, 3, 5. It will be passed one value at a time to `i`.
# that is, `i` will take values 1 first, then 3 and then 5 and each time
# the code within is executed.
o <- lapply(idx, function(i) {
# Fixed-width windows covering 0 .. max observed time, 401 s per window.
ir1 <- IRanges(start=seq(0, max(data[[i]]), by=401), width=401)
# Each observation time as a width-1 range.
ir2 <- IRanges(start=data[[i]], width=1)
# Which window does each observation fall into?
t <- findOverlaps(ir1, ir2)
# Mean of the paired Speed column (i + 1) within each window.
# NOTE(review): windows with no observations drop out of tapply(), so
# this cbind() assumes every window matched at least one time point --
# confirm for sparse data (the question reports exactly this problem).
d <- data.frame(mean=tapply(data[[i+1]], queryHits(t), mean))
cbind(as.data.frame(ir1), d)
})
which gives this output
# > o
# [[1]]
# start end width mean
# 1 0 400 401 1.05
#
# [[2]]
# start end width mean
# 1 0 400 401 1.1
#
# [[3]]
# start end width mean
# 1 0 400 401 1.383333
So if I wanted it to be every 100 s... I'll just change ir1 <- ....., by = 401 to become by=100.
But my data is weird because of a few things
my data doesn't always start at 0 s; sometimes it starts at 20 s, depending on the specimen and whether it moves
My data collection does not happen every 1s or 2s or 3s. Hence sometimes I get data 1-20 s but it skips over 20-40 s simply because the specimen does not move.
I think the findOverlaps portion of the code affects my output. How can I get rid of that without disturbing the output?
Here is some data to illustrate my troubles...but all of my real data ends in 2000s
Time Speed Time Speed Time Speed
6.3 1.6 3.1 1.7 0.3 2.4
11.3 1.3 5.1 2.2 1.3 1.3
13.8 1.3 6.3 3.4 3.1 1.5
14.1 1.0 7.0 2.3 4.5 2.7
47.4 2.9 11.3 1.2 5.1 0.5
49.2 0.7 26.5 3.3 5.9 1.7
50.5 0.9 27.3 3.4 9.7 2.4
57.1 1.3 36.6 2.5 11.8 1.3
72.9 2.9 40.3 1.1 13.1 1.0
86.6 2.4 44.3 3.2 13.8 0.6
88.5 3.4 50.9 2.6 14.0 2.4
89.0 3.0 62.6 1.5 14.8 2.2
94.8 2.9 66.8 0.5 15.5 2.6
117.4 0.5 67.3 1.1 16.4 3.2
123.7 3.2 67.7 0.6 26.5 0.9
124.5 1.0 68.2 3.2 44.7 3.0
126.1 2.8 72.1 2.2 45.1 0.8
As you can see from the data, it doesn't necessarily end at 60 s; sometimes it only ends at 57 s, and so on.
EDIT add dput of data
# The question's example data, assigned to `data` so the IRanges snippet
# above can reference it (the transcript showed an unassigned dput()).
data <- structure(list(Time = c(6.3, 11.3, 13.8, 14.1, 47.4, 49.2, 50.5,
57.1, 72.9, 86.6, 88.5, 89, 94.8, 117.4, 123.7, 124.5, 126.1),
    Speed = c(1.6, 1.3, 1.3, 1, 2.9, 0.7, 0.9, 1.3, 2.9, 2.4,
    3.4, 3, 2.9, 0.5, 3.2, 1, 2.8), Time.1 = c(3.1, 5.1, 6.3,
    7, 11.3, 26.5, 27.3, 36.6, 40.3, 44.3, 50.9, 62.6, 66.8,
    67.3, 67.7, 68.2, 72.1), Speed.1 = c(1.7, 2.2, 3.4, 2.3,
    1.2, 3.3, 3.4, 2.5, 1.1, 3.2, 2.6, 1.5, 0.5, 1.1, 0.6, 3.2,
    2.2), Time.2 = c(0.3, 1.3, 3.1, 4.5, 5.1, 5.9, 9.7, 11.8,
    13.1, 13.8, 14, 14.8, 15.5, 16.4, 26.5, 44.7, 45.1), Speed.2 = c(2.4,
    1.3, 1.5, 2.7, 0.5, 1.7, 2.4, 1.3, 1, 0.6, 2.4, 2.2, 2.6,
    3.2, 0.9, 3, 0.8)), .Names = c("Time", "Speed", "Time.1",
"Speed.1", "Time.2", "Speed.2"), class = "data.frame", row.names = c(NA,
-17L))
sorry if i don't understand your question entirely, could you explain why this example doesn't do what you're trying to do?
# Demonstration on a built-in data set: bin one variable with cut(),
# then average within each bin via tapply().
mtcars

# The variable to bin -- time in the original question, mpg here.
measure <- "mpg"

# Bins: -Inf/Inf caps around breakpoints every 2.5 units from 15 to 25,
# so every record lands in some interval.
bin_breaks <- c(-Inf, seq(15, 25, by = 2.5), Inf)
cutpoints <- cut(mtcars[, measure], bin_breaks)

# Inspect the bin assigned to every record.
cutpoints

# The bins can be merged back onto the mtcars data frame if you like...
mtcars$cutpoints <- cutpoints
# ...but that's not necessary for tapply() below.

# Mean of the binned variable within each bin.
tapply(mtcars[, measure], cutpoints, mean)

# Mean of a different variable within the same bins.
tapply(mtcars[, "wt"], cutpoints, mean)

Resources