Change labels from numeric to numeric - R

I have the following dataset.
dat2 <- read.table(header=TRUE, text="
ID De Ep Ti ID1
1123 113 121 100 11231
1123 105 107 110 11232
1134 122 111 107 11241
1134 117 120 111 11242
1154 122 116 109 11243
1165 108 111 118 11251
1175 106 115 113 11252
1185 113 104 108 11253
1226 109 119 116 11261
")
dat2
    ID  De  Ep  Ti   ID1
1 1123 113 121 100 11231
2 1123 105 107 110 11232
3 1134 122 111 107 11241
4 1134 117 120 111 11242
5 1154 122 116 109 11243
6 1165 108 111 118 11251
7 1175 106 115 113 11252
8 1185 113 104 108 11253
9 1226 109 119 116 11261
I want to recode the first two columns with the following numeric labels, but cut() turns them into factors.
dat2$ID <- cut(dat2$ID, breaks=c(0,1124,1154,1184,Inf),
labels=c(5, 25, 55, 75))
table(dat2$ID)
5 25 55 75
2 3 2 2
dat2$De <- cut(dat2$De, breaks=c(0,110,118,125,Inf),
labels=c(10, 20, 30, 40))
table(dat2$De)
10 20 30 40
4 3 2 0
str(dat2)
'data.frame': 9 obs. of 5 variables:
$ ID : Factor w/ 4 levels "5","25","55",..: 1 1 2 2 2 3 3 4 4
$ De : Factor w/ 4 levels "10","20","30",..: 2 1 3 2 3 1 1 2 1
$ Ep : int 121 107 111 120 116 111 115 104 119
$ Ti : int 100 110 107 111 109 118 113 108 116
$ ID1: int 11231 11232 11241 11242 11243 11251 11252 11253 11261
I used as.numeric to convert them back to numeric, but that assigns new labels (1, 2, 3, ...), which is not what I want. I need a simple line of code to do the transformation.
dat2$ID <- as.numeric(dat2$ID)
table(dat2$ID)
1 2 3 4
2 3 2 2
dat2$De <- as.numeric(dat2$De)
table(dat2$De)
1 2 3
4 3 2

In your case it will probably be more efficient to use findInterval directly, instead of converting numeric values to factors and then back to numeric, as shown here:
c(5, 25, 55, 75)[findInterval(dat2$ID, c(0, 1124, 1154, 1184, Inf))]
## [1] 5 5 25 25 55 55 55 75 75
Or, for the second column:
c(10, 20, 30, 40)[findInterval(dat2$De, c(0, 110, 118, 125, Inf))]
## [1] 20 10 30 20 30 10 10 20 10
This is essentially what cut does, except that findInterval returns the numeric values directly. Note one subtle difference: findInterval uses left-closed intervals by default, while cut defaults to right-closed intervals, which is why the boundary value 1154 is binned as 55 above but as 25 by cut:
cut(dat2$ID, breaks=c(0, 1124, 1154, 1184, Inf), labels=c(5, 25, 55, 75))
# [1] 5 5 25 25 25 55 55 75 75
# Levels: 5 25 55 75
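If an exact match with cut's default right-closed intervals is needed, findInterval has a left.open argument (available since R 3.3.0); a minimal sketch:
## left.open = TRUE bins into (lo, hi] intervals, matching cut's default right = TRUE
c(5, 25, 55, 75)[findInterval(dat2$ID, c(0, 1124, 1154, 1184, Inf), left.open = TRUE)]
## [1]  5  5 25 25 25 55 55 75 75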
Here's a quick benchmark showing an ~18x speed improvement:
set.seed(123)
x <- sample(1e8, 1e7, replace = TRUE)
system.time({
res1 <- cut(x, breaks = c(0, 1e4, 1e5, 1e6, Inf), labels = c(5, 25, 55, 75))
res1 <- as.numeric(levels(res1))[res1]
})
# user system elapsed
# 3.40 0.09 3.51
system.time(res2 <- c(5, 25, 55, 75)[findInterval(x, c(0, 1e4, 1e5, 1e6, Inf))])
# user system elapsed
# 0.18 0.03 0.20
identical(res1, res2)
## [1] TRUE


R - Reducing a matrix

I have a square matrix that looks like this:
A <- c("111","111","111","112","112","113")
B <- c(100,10,20,NA,NA,10)
C <- c(10,20,40,NA,10,20)
D <- c(10,20,NA,NA,40,200)
E <- c(20,20,40,10,10,20)
F <- c(NA,NA,40,100,10,20)
G <- c(10,20,NA,30,10,20)
df <- data.frame(A,B,C,D,E,F,G)
names(df) <- c("Codes","111","111","111","112","112","113")
# Codes 111 111 111 112 112 113
# 1 111 100 10 10 20 NA 10
# 2 111 10 20 20 20 NA 20
# 3 111 20 40 NA 40 40 NA
# 4 112 NA NA NA 10 100 30
# 5 112 NA 10 40 10 10 10
# 6 113 10 20 200 20 20 20
I want to reduce it so that observations with the same row and column names are summed up.
So I want to end up with:
# Codes 111 112 113
# 1 111 230 120 30
# 2 112 50 130 40
# 3 113 230 40 20
I tried to first combine the rows with the same "Codes" number, but I was having a lot of trouble.
In the tidyverse:
library(tidyverse)
df %>%
pivot_longer(-Codes, values_drop_na = TRUE) %>%
group_by(Codes, name) %>%
summarise(value = sum(value), .groups = 'drop')%>%
pivot_wider()
# A tibble: 3 x 4
Codes `111` `112` `113`
<chr> <dbl> <dbl> <dbl>
1 111 230 120 30
2 112 50 130 40
3 113 230 40 20
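A slightly shorter variant (my sketch; it assumes a tidyr version in which pivot_wider() has the values_fn argument) lets pivot_wider() do the summing itself:
library(tidyverse)
df %>%
  pivot_longer(-Codes, values_drop_na = TRUE) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = sum)
This gives the same 3 x 4 tibble as above.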
One way in base R (note that with this grouping order the result comes out transposed, with the column names as rows and the Codes as columns):
tapply(unlist(df[-1]), list(names(df)[-1][col(df[-1])], df[,1][row(df[-1])]), sum, na.rm = TRUE)
111 112 113
111 230 50 230
112 120 130 40
113 30 40 20
As noted by #thelatemail, this can be simplified to:
grp <- expand.grid(df$Codes, names(df)[-1])
tapply(unlist(df[-1]), grp, FUN=sum, na.rm=TRUE)
You can also use xtabs:
xtabs(vals~., na.omit(cbind(grp, vals = unlist(df[-1]))))
Var2
Var1 111 112 113
111 230 120 30
112 50 130 40
113 230 40 20
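Another base R possibility (a sketch of mine, not from the original answers) uses rowsum(), which collapses rows by a grouping vector; applying it a second time to the transpose collapses the duplicate column names:
m <- as.matrix(df[-1])
by_row <- rowsum(m, group = df$Codes, na.rm = TRUE) ## sum rows with the same code
by_both <- t(rowsum(t(by_row), group = colnames(by_row), na.rm = TRUE)) ## then sum columns with the same name
by_both
##     111 112 113
## 111 230 120  30
## 112  50 130  40
## 113 230  40  20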
When dealing with actual matrices, especially large ones, expressing the operation as (sparse) linear algebra should be the most efficient approach.
library(Matrix) ## for sparse matrix operations
idx <- c("111","111","111","112","112","113")
mat <- matrix(c(100,10,20,NA,NA,10,
10,20,40,NA,10,20,
10,20,NA,NA,40,200,
20,20,40,10,10,20,
NA,NA,40,100,10,20,
10,20,NA,30,10,20),
nrow=length(idx),
byrow=TRUE, dimnames=list(idx, idx))
## convert NA's to zero
mat[is.na(mat)] <- 0
## examine matrix
mat
## 111 111 111 112 112 113
## 111 100 10 20 0 0 10
## 111 10 20 40 0 10 20
## 111 10 20 0 0 40 200
## 112 20 20 40 10 10 20
## 112 0 0 40 100 10 20
## 113 10 20 0 30 10 20
## indicator matrix
## converts between "code" and "idx" spaces
M_code_idx <- fac2sparse(idx)
## project to "code_code" space
M_code_idx %*% mat %*% t(M_code_idx)
## 3 x 3 Matrix of class "dgeMatrix"
## 111 112 113
## 111 230 50 230
## 112 120 130 40
## 113 30 40 20
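A small observation of my own: mat above was entered as the transpose of the question's data frame (compare it with the printed df), so the projected result is also the transpose of the desired table. Transposing the result, or entering the data in the original orientation, recovers the question's expected layout:
t(M_code_idx %*% mat %*% t(M_code_idx))
##     111 112 113
## 111 230 120  30
## 112  50 130  40
## 113 230  40  20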

Sorting one variable in a data frame by id

I have a data frame with a lot of company information separated by an id variable. I want to sort one of the variables within every id group. Let's take this example:
df <- structure(list(id = c(110, 110, 110, 90, 90, 90, 90, 252, 252
), var1 = c(26, 21, 54, 10, 18, 9, 16, 54, 39), var2 = c(234,
12, 43, 32, 21, 19, 16, 34, 44)), .Names = c("id", "var1", "var2"
), row.names = c(NA, -9L), class = "data.frame")
Which looks like this
df
id var1 var2
1 110 26 234
2 110 21 12
3 110 54 43
4 90 10 32
5 90 18 21
6 90 9 19
7 90 16 16
8 252 54 34
9 252 39 44
Now, I want to sort the data frame according to var1 within each id. The easiest solution I can think of is using the apply function, like this:
> apply(df, 2, sort)
id var1 var2
[1,] 90 9 12
[2,] 90 10 16
[3,] 90 16 19
[4,] 90 18 21
[5,] 110 21 32
[6,] 110 26 34
[7,] 110 39 43
[8,] 252 54 44
[9,] 252 54 234
However, this is not the output I am seeking. The correct output should be,
id var1 var2
1 110 21 12
2 110 26 234
3 110 54 43
4 90 9 19
5 90 10 32
6 90 16 16
7 90 18 21
8 252 39 44
9 252 54 34
That is, group by id, sort by var1 within each group, and keep the original order in which the id values appear.
Any idea how to sort like this?
Note: as mentioned by Moody_Mudskipper, there is no need to use the tidyverse; this can also be done easily in base R:
df[order(ordered(df$id, unique(df$id)), df$var1), ]
A one-liner tidyverse solution without any temporary variables:
library(tidyverse)
df %>% arrange(ordered(id, unique(id)), var1)
#    id var1 var2
# 1 110   21   12
# 2 110   26  234
# 3 110   54   43
# 4  90    9   19
# 5  90   10   32
# 6  90   16   16
# 7  90   18   21
# 8 252   39   44
# 9 252   54   34
Explanation of why apply(df, 2, sort) does not work
What you were trying to do is sort each column independently. apply runs over the specified dimension (2 in this case, which corresponds to columns) and applies the function (sort in this case).
apply then tries to simplify the result, in this case to a matrix. So you get back a matrix (not a data.frame) in which each column has been sorted independently. For example, this row from the apply call:
# [1,] 90 9 12
does not even exist in the original data.frame.
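To make this concrete, a quick check (my addition) shows that the result is a plain matrix whose first row never occurred in df:
sorted <- apply(df, 2, sort)
is.data.frame(sorted) ## FALSE -- it is a matrix
sorted[1, ]
##   id var1 var2
##   90    9   12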
Another base R option using order and match
df[with(df, order(match(id, unique(id)), var1, var2)), ]
# id var1 var2
#2 110 21 12
#1 110 26 234
#3 110 54 43
#6 90 9 19
#4 90 10 32
#7 90 16 16
#5 90 18 21
#9 252 39 44
#8 252 54 34
We can convert id to a factor in order to split while preserving the original order. We can then loop over the list, order each piece, and rbind the pieces back together, i.e.
df$id <- factor(df$id, levels = unique(df$id))
do.call(rbind, lapply(split(df, df$id), function(i)i[order(i$var1),]))
# id var1 var2
#110.2 110 21 12
#110.1 110 26 234
#110.3 110 54 43
#90.6 90 9 19
#90.4 90 10 32
#90.7 90 16 16
#90.5 90 18 21
#252.9 252 39 44
#252.8 252 54 34
NOTE: You can reset the rownames by rownames(new_df) <- NULL
In base R we could use split<- :
split(df,df$id) <- lapply(split(df,df$id), function(x) x[order(x$var1),] )
or, as #Markus suggests:
split(df, df$id) <- by(df, df$id, function(x) x[order(x$var1),])
The output in either case:
df
# id var1 var2
# 1 110 21 12
# 2 110 26 234
# 3 110 54 43
# 4 90 9 19
# 5 90 10 32
# 6 90 16 16
# 7 90 18 21
# 8 252 39 44
# 9 252 54 34
With the following tidyverse pipe, the question's output is reproduced.
library(tidyverse)
df %>%
mutate(tmp = cumsum(c(0, diff(id) != 0))) %>%
group_by(id) %>%
arrange(tmp, var1) %>%
select(-tmp)
## A tibble: 9 x 3
## Groups: id [3]
# id var1 var2
# <dbl> <dbl> <dbl>
#1 110 21 12
#2 110 26 234
#3 110 54 43
#4 90 9 19
#5 90 10 32
#6 90 16 16
#7 90 18 21
#8 252 39 44
#9 252 54 34

R - allocate a share of a number over different columns using an ifelse statement

I have the following data set:
observation <- c(1:10)
pop.d.rank <- c(1:10)
cost.1 <- c(101:110)
cost.2 <- c(102:111)
cost.3 <- c(103:112)
all <- data.frame(observation,pop.d.rank,cost.1, cost.2, cost.3)
And I want to allocate the following amount of money over three years:
annual.investment <- 500
I can do this for the first year with the following script:
library(dplyr)
all <- all %>%
mutate(capital_allocated.5G = diff(c(0, pmin(cumsum(cost), annual.investment)))) %>%
mutate(capital_percentage.5G = capital_allocated.5G / cost * 100) %>%
mutate(year = ifelse(capital_percentage.5G >= 50, "Year.1",0))
But when I try to do this for the second year, taking into account the previous year's investment, the code does not work. Here is my attempt at putting an ifelse statement in the mutate loop so that it does not overwrite the money allocated in the previous year:
all <- all %>%
mutate(capital_allocated.5G = ifelse(year == 0, diff(c(0, pmin(cumsum(cost), annual.investment))), 0)) %>%
mutate(capital_percentage.5G = capital_allocated.5G / cost * 100) %>%
mutate(year = ifelse(capital_percentage.5G >= 50, "Year.2",0))
I want the data to look like the following, where the amount allocated goes first to any row that hasn't been 100% completed from the previous year.
capital_allocated.5G <- c(101, 102, 103, 104, 105, 106, 107, 108, 109, 55)
capital_percentage.5G <- c(100, 100, 100, 100, 100, 100, 100, 100, 100, 50)
year <- c("Year.1", "Year.1","Year.1", "Year.1","Year.1", "Year.2", "Year.2","Year.2", "Year.2","Year.2")
example.output <- data.frame(observation,pop.d.rank,cost, capital_allocated.5G, capital_percentage.5G, year)
Edit: cost.1 is the cost variable for year 1, cost.2 is the cost variable for year 2, and cost.3 is the cost variable for year 3.
EDIT: Problem with previously accepted answer
I've realised that this ends up allocating in excess of 100 for the capital_percentage.5G variable. I have created a reproducible example. I think this relates to the fact that some costs decrease over time and some costs increase over time.
The logic behind this is that when an investment is made in one year, there is a specific cost of deployment for a 5G mobile network and that is what the cost columns relate to for that point in time. Once that investment has been made in one year, I want the function to provide a capital_percentage.5G 100% and then not allocate any more capital to it in future years.
How do I get it so that the percentage value hits a limit at 100 and more of the capital allocation isn't allocated to it at a later date?
observation <- c(1:10)
pop.d.rank <- c(1:10)
cost.1 <- c(101:110)
cost.2 <- c(110:101)
cost.3 <- c(100:91)
all <- data.frame(observation,pop.d.rank,cost.1, cost.2, cost.3)
capital_allocated.5G <- rep(0,10) ## initialize to zero
capital_percentage.5G <- rep(0,10) ## initialize to zero
year <- rep(NA,10) ## initialize to NA
all <- data.frame(observation,pop.d.rank,cost.1, cost.2, cost.3, capital_allocated.5G,capital_percentage.5G,year)
alloc.invest <- function(df, ann.invest, y) {
df %>% mutate_(cost=paste0("cost.",y)) %>%
mutate(capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(capital_percentage.5G < 50, NA, year),
not.yet.alloc = ifelse(capital_percentage.5G < 100,cost-capital_allocated.5G,0),
capital_allocated.5G = capital_allocated.5G + ifelse(capital_percentage.5G < 100,diff(c(0, pmin(cumsum(not.yet.alloc), ann.invest))), 0),
capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(is.na(year) & capital_percentage.5G >= 50, paste0("Year.",y), year)) %>%
select(-cost,-not.yet.alloc)
}
annual.investment <- 500
all <- alloc.invest(all,annual.investment,1)
print(all)
all <- alloc.invest(all,annual.investment,2)
print(all)
all <- alloc.invest(all,annual.investment,3)
print(all)
In year 3, the final investment allocation here, capital_percentage.5G suddenly shoots up to 110%.
Updated for year-on-year costs that may increase or decrease
When costs can decrease as well as increase from year to year, we simply drop the check of whether capital_percentage.5G has already reached 100 percent when updating not.yet.alloc and capital_allocated.5G:
library(dplyr)
alloc.invest <- function(df, ann.invest, y) {
df %>% mutate_(cost=paste0("cost.",y)) %>%
mutate(capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(capital_percentage.5G < 50, NA, year),
not.yet.alloc = cost-capital_allocated.5G,
capital_allocated.5G = capital_allocated.5G + diff(c(0, pmin(cumsum(not.yet.alloc), ann.invest))),
capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(is.na(year) & capital_percentage.5G >= 50, paste0("Year.",y), year)) %>%
select(-cost,-not.yet.alloc)
}
With the new cost data:
observation <- c(1:10)
pop.d.rank <- c(1:10)
cost.1 <- c(101:110)
cost.2 <- c(110:101)
cost.3 <- c(100:91)
Augment with initial value columns as before:
capital_allocated.5G <- rep(0,10) ## initialize to zero
capital_percentage.5G <- rep(0,10) ## initialize to zero
year <- rep(NA,10) ## initialize to NA
all <- data.frame(observation,pop.d.rank,cost.1, cost.2, cost.3, capital_allocated.5G,capital_percentage.5G,year)
Year 1:
annual.investment <- 500
all <- alloc.invest(all,annual.investment,1)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 110 100 101 100.00000 Year.1
##2 2 2 102 109 99 102 100.00000 Year.1
##3 3 3 103 108 98 103 100.00000 Year.1
##4 4 4 104 107 97 104 100.00000 Year.1
##5 5 5 105 106 96 90 85.71429 Year.1
##6 6 6 106 105 95 0 0.00000 <NA>
##7 7 7 107 104 94 0 0.00000 <NA>
##8 8 8 108 103 93 0 0.00000 <NA>
##9 9 9 109 102 92 0 0.00000 <NA>
##10 10 10 110 101 91 0 0.00000 <NA>
Year 2:
all <- alloc.invest(all,annual.investment,2)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 110 100 110 100.00000 Year.1
##2 2 2 102 109 99 109 100.00000 Year.1
##3 3 3 103 108 98 108 100.00000 Year.1
##4 4 4 104 107 97 107 100.00000 Year.1
##5 5 5 105 106 96 106 100.00000 Year.1
##6 6 6 106 105 95 105 100.00000 Year.2
##7 7 7 107 104 94 104 100.00000 Year.2
##8 8 8 108 103 93 103 100.00000 Year.2
##9 9 9 109 102 92 102 100.00000 Year.2
##10 10 10 110 101 91 46 45.54455 <NA>
Year 3:
all <- alloc.invest(all,annual.investment,3)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 110 100 100 100 Year.1
##2 2 2 102 109 99 99 100 Year.1
##3 3 3 103 108 98 98 100 Year.1
##4 4 4 104 107 97 97 100 Year.1
##5 5 5 105 106 96 96 100 Year.1
##6 6 6 106 105 95 95 100 Year.2
##7 7 7 107 104 94 94 100 Year.2
##8 8 8 108 103 93 93 100 Year.2
##9 9 9 109 102 92 92 100 Year.2
##10 10 10 110 101 91 91 100 Year.3
The original issue with your code is that ifelse only switches the output element-wise based on the condition; it does not subset the input used inside the TRUE branch. Therefore, cumsum(cost) is computed over all costs, not just over the rows where the condition is TRUE (see the small illustration after the notes below). To fix this, we can define the following function and run it for each year in turn.
library(dplyr)
alloc.invest <- function(df, ann.invest, y) {
df %>% mutate(not.yet.alloc = ifelse(capital_percentage.5G < 100,cost-capital_allocated.5G,0),
capital_allocated.5G = capital_allocated.5G + ifelse(capital_percentage.5G < 100,diff(c(0, pmin(cumsum(not.yet.alloc), ann.invest))), 0),
capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(is.na(year) & capital_percentage.5G >= 50, paste0("Year.",y), year)) %>%
select(-not.yet.alloc)
}
Note:
We create a new temporary column not.yet.alloc, from which we compute the cumulative sum for the year's allocation.
Separate mutate statements are not needed.
We also need to check is.na(year) before setting year; otherwise rows already labelled in a previous year would be overwritten.
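To see the ifelse() behaviour described above in isolation, here is a tiny self-contained sketch (my addition, not part of the original answer):
x <- c(1, 2, 3, 4)
keep <- c(TRUE, FALSE, TRUE, TRUE)
ifelse(keep, cumsum(x), 0) ## 1  0  6 10 -- cumsum() ran over ALL of x before ifelse() picked elements
cumsum(x[keep])            ## 1  4  8    -- the cumulative sum over only the kept rows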
To use this function, we must first augment the input data with some initial values for capital_allocated.5G, capital_percentage.5G, and year:
capital_allocated.5G <- rep(0,10) ## initialize to zero
capital_percentage.5G <- rep(0,10) ## initialize to zero
year <- rep(NA,10) ## initialize to NA
all <- data.frame(observation,pop.d.rank,cost,capital_allocated.5G,capital_percentage.5G,year)
Then for Year 1:
annual.investment <- 500
all <- alloc.invest(all,annual.investment,1)
print(all)
## observation pop.d.rank cost capital_allocated.5G capital_percentage.5G year
##1 1 1 101 101 100.00000 Year.1
##2 2 2 102 102 100.00000 Year.1
##3 3 3 103 103 100.00000 Year.1
##4 4 4 104 104 100.00000 Year.1
##5 5 5 105 90 85.71429 Year.1
##6 6 6 106 0 0.00000 <NA>
##7 7 7 107 0 0.00000 <NA>
##8 8 8 108 0 0.00000 <NA>
##9 9 9 109 0 0.00000 <NA>
##10 10 10 110 0 0.00000 <NA>
and for Year 2:
all <- alloc.invest(all,annual.investment,2)
print(all)
## observation pop.d.rank cost capital_allocated.5G capital_percentage.5G year
##1 1 1 101 101 100 Year.1
##2 2 2 102 102 100 Year.1
##3 3 3 103 103 100 Year.1
##4 4 4 104 104 100 Year.1
##5 5 5 105 105 100 Year.1
##6 6 6 106 106 100 Year.2
##7 7 7 107 107 100 Year.2
##8 8 8 108 108 100 Year.2
##9 9 9 109 109 100 Year.2
##10 10 10 110 55 50 Year.2
Update to new requirement of changing costs per year
If costs are different per year, then the function needs to readjust the capital_percentage.5G and possibly the year columns first:
library(dplyr)
alloc.invest <- function(df, ann.invest, y) {
df %>% mutate_(cost=paste0("cost.",y)) %>%
mutate(capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(capital_percentage.5G < 50, NA, year),
not.yet.alloc = ifelse(capital_percentage.5G < 100,cost-capital_allocated.5G,0),
capital_allocated.5G = capital_allocated.5G + ifelse(capital_percentage.5G < 100,diff(c(0, pmin(cumsum(not.yet.alloc), ann.invest))), 0),
capital_percentage.5G = capital_allocated.5G / cost * 100,
year = ifelse(is.na(year) & capital_percentage.5G >= 50, paste0("Year.",y), year)) %>%
select(-cost,-not.yet.alloc)
}
Note that creating another temporary column cost via mutate_ is just for convenience, since the cost column has to be selected dynamically based on the input y (otherwise we would need mutate_ for all of the computations, which would be messier).
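As a side note (my sketch, assuming dplyr >= 1.0, where mutate_() is deprecated), the same dynamic column selection can be written with .data[[ ]] inside a regular mutate():
library(dplyr)
## hypothetical helper showing only the column-selection step
select_year_cost <- function(df, y) {
  df %>% mutate(cost = .data[[paste0("cost.", y)]])
}
## e.g. select_year_cost(all, 2) adds a column cost equal to cost.2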
With the updated data similarly augmented with initial values for capital_allocated.5G, capital_percentage.5G, and year, Year 1:
annual.investment <- 500
all <- alloc.invest(all,annual.investment,1)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 102 103 101 100.00000 Year.1
##2 2 2 102 103 104 102 100.00000 Year.1
##3 3 3 103 104 105 103 100.00000 Year.1
##4 4 4 104 105 106 104 100.00000 Year.1
##5 5 5 105 106 107 90 85.71429 Year.1
##6 6 6 106 107 108 0 0.00000 <NA>
##7 7 7 107 108 109 0 0.00000 <NA>
##8 8 8 108 109 110 0 0.00000 <NA>
##9 9 9 109 110 111 0 0.00000 <NA>
##10 10 10 110 111 112 0 0.00000 <NA>
Year 2: note that the last asset has less than 50% allocated, so its year is still NA.
all <- alloc.invest(all,annual.investment,2)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 102 103 102 100.00000 Year.1
##2 2 2 102 103 104 103 100.00000 Year.1
##3 3 3 103 104 105 104 100.00000 Year.1
##4 4 4 104 105 106 105 100.00000 Year.1
##5 5 5 105 106 107 106 100.00000 Year.1
##6 6 6 106 107 108 107 100.00000 Year.2
##7 7 7 107 108 109 108 100.00000 Year.2
##8 8 8 108 109 110 109 100.00000 Year.2
##9 9 9 109 110 111 110 100.00000 Year.2
##10 10 10 110 111 112 46 41.44144 <NA>
Year 3:
all <- alloc.invest(all,annual.investment,3)
print(all)
## observation pop.d.rank cost.1 cost.2 cost.3 capital_allocated.5G capital_percentage.5G year
##1 1 1 101 102 103 103 100 Year.1
##2 2 2 102 103 104 104 100 Year.1
##3 3 3 103 104 105 105 100 Year.1
##4 4 4 104 105 106 106 100 Year.1
##5 5 5 105 106 107 107 100 Year.1
##6 6 6 106 107 108 108 100 Year.2
##7 7 7 107 108 109 109 100 Year.2
##8 8 8 108 109 110 110 100 Year.2
##9 9 9 109 110 111 111 100 Year.2
##10 10 10 110 111 112 112 100 Year.3

Manual calculation of the Kaplan-Meier estimator

I thought this would be a trivial thing to do but I still have some trouble adjusting to writing code instead of pointing and clicking on a spreadsheet.
month = as.integer(c(1,2,3,4,5,6,7,8,9,10,11,12))
remaining = c(1000,925,852,790,711,658,601,567,530,501,485,466)
left = c(75, 73, 62, 79, 53, 57, 34, 37, 29, 16, 19, 0)
KPdata = data.frame(month, remaining, left)
> KPdata
month remaining left
1 1 1000 75
2 2 925 73
3 3 852 62
4 4 790 79
5 5 711 53
6 6 658 57
7 7 601 34
8 8 567 37
9 9 530 29
10 10 501 16
11 11 485 19
12 12 466 0
How do I calculate the Kaplan-Meier survival function at each month? Note that I want to do this manually; I am aware that there are packages which will do it for me.
I think this is what you're trying to do. We use dplyr's lag() (which shifts a plain vector; base R's lag() is for time series) together with cumprod to get a manual KM estimator:
library(dplyr) ## for lag() on a plain vector
KPdata$KM_init <- lag((KPdata$remaining - KPdata$left) / KPdata$remaining)
KPdata[1, ncol(KPdata)] <- 1
KPdata$KM_final <- cumprod(KPdata$KM_init)
KPdata
month remaining left KM_init KM_final
1 1 1000 75 1.0000000 1.000
2 2 925 73 0.9250000 0.925
3 3 852 62 0.9210811 0.852
4 4 790 79 0.9272300 0.790
5 5 711 53 0.9000000 0.711
6 6 658 57 0.9254571 0.658
7 7 601 34 0.9133739 0.601
8 8 567 37 0.9434276 0.567
9 9 530 29 0.9347443 0.530
10 10 501 16 0.9452830 0.501
11 11 485 19 0.9680639 0.485
12 12 466 0 0.9608247 0.466
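Since there is no censoring in this table, the cumulative product telescopes, so the estimate is simply the fraction of the original 1000 still remaining; a quick sanity check (my addition):
## with no censoring, the KM product reduces to remaining / initial cohort size
all.equal(KPdata$KM_final, KPdata$remaining / 1000)
## [1] TRUE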
Alternatively, I think there's a different form of a KM estimator that looks like this (note that I've added a row corresponding to month = 0):
month = as.integer(c(0,1,2,3,4,5,6,7,8,9,10,11,12))
remaining = c(1000,1000,925,852,790,711,658,601,567,530,501,485,466)
left = c(0,75, 73, 62, 79, 53, 57, 34, 37, 29, 16, 19, 0)
KPdata2 = data.frame(month, remaining, left)
KPdata2$KM_init <- (KPdata2$remaining - KPdata2$left) / KPdata2$remaining
KPdata2$KM_final <- cumprod(KPdata2$KM_init)
KPdata2
month remaining left KM_init KM_final
1 0 1000 0 1.0000000 1.000
2 1 1000 75 0.9250000 0.925
3 2 925 73 0.9210811 0.852
4 3 852 62 0.9272300 0.790
5 4 790 79 0.9000000 0.711
6 5 711 53 0.9254571 0.658
7 6 658 57 0.9133739 0.601
8 7 601 34 0.9434276 0.567
9 8 567 37 0.9347443 0.530
10 9 530 29 0.9452830 0.501
11 10 501 16 0.9680639 0.485
12 11 485 19 0.9608247 0.466
13 12 466 0 1.0000000 0.466
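As a cross-check (my addition, using the survival package rather than the manual calculation), expanding the table into one record per subject and running survfit() reproduces the same estimates:
library(survival)
## deaths occur at their month; the 466 still remaining are censored at month 12
time <- c(rep(KPdata2$month, KPdata2$left), rep(12, 466))
status <- c(rep(1, sum(KPdata2$left)), rep(0, 466))
fit <- survfit(Surv(time, status) ~ 1)
summary(fit)$surv
## 0.925 0.852 0.790 ... 0.485 0.466 -- matches KM_final for months 1 to 11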
I was so taken with the question and #bouncyball's inspiring answer, I thought I'd add my ha'penny worth with an attempt at handling censoring. This is intended to be in the spirit of the original question - doing things 'handraulically' to develop key insights.
## rename remaining -> survived; left -> died
month = as.integer(c(1,2,3,4,5,6,7,8,9,10,11,12))
survived = c(1000,925,852,790,711,658,601,567,530,501,485,466)
died = c(75, 73, 62, 79, 53, 57, 34, 37, 29, 16, 19, 0)
## arbitrary censoring: 10 per time period
censored <- c(0, rep(10, 11))
## define those at risk <= those who survived
## a bit of base-R fiddling to offset the index by one period
len <- length(month)
at.risk <- c(survived[1],
             survived[-len] - died[-len] - cumsum(censored[-len]))
## note use of cumsum()
KPdata3 <- data.frame(month, at.risk, censored, died, survived)
## censoring uses at.risk, rather than survived/remaining
KPdata3$KM_increment <- (KPdata3$at.risk - KPdata3$died) / KPdata3$at.risk
## code credit to #bouncyball
KPdata3$KM_cumulative <- cumprod(KPdata3$KM_increment)
KPdata3
Gives this:
month at.risk censored died survived KM_increment KM_cumulative
1 1 1000 0 75 1000 0.9250000 0.9250000
2 2 925 10 73 925 0.9210811 0.8520000
3 3 842 10 62 852 0.9263658 0.7892637
4 4 770 10 79 790 0.8974026 0.7082873
5 5 681 10 53 711 0.9221733 0.6531636
6 6 618 10 57 658 0.9077670 0.5929203
7 7 551 10 34 601 0.9382940 0.5563336
8 8 507 10 37 567 0.9270217 0.5157333
9 9 460 10 29 530 0.9369565 0.4832197
10 10 421 10 16 501 0.9619952 0.4648551
11 11 395 10 19 485 0.9518987 0.4424949
12 12 366 10 0 466 1.0000000 0.4424949
Setting censored <- c(0, rep(0, 11)) (i.e. no censoring) gives the same answer as #bouncyball's.

Split intervals (genomic regions) in individual numbers (nucleotides)

I would like to transform my data frame df, which is based on regions, into point-by-point (number-by-number, or nucleotide-by-nucleotide) information.
My input df:
start end state freq
100 103 1nT 22
100 103 3nT 34
104 106 1nT 12
104 106 3nT 16
My expected output:
position state freq
100 1nT 22
101 1nT 22
102 1nT 22
103 1nT 22
100 3nT 34
101 3nT 34
102 3nT 34
103 3nT 34
104 1nT 12
105 1nT 12
106 1nT 12
104 3nT 16
105 3nT 16
106 3nT 16
Any ideas? Thank you very much.
Here is a vectorized approach:
# load your data
df <- read.table(textConnection("start end state freq
100 103 1nT 22
100 103 3nT 34
104 106 1nT 12
104 106 3nT 16"), header=TRUE)
# extract number of needed replications
n <- df$end - df$start + 1
# calculate position and replicate state/freq
res <- data.frame(position = rep(df$start - 1, n) + sequence(n),
state = rep(df$state, n),
freq = rep(df$freq, n))
res
# position state freq
# 1 100 1nT 22
# 2 101 1nT 22
# 3 102 1nT 22
# 4 103 1nT 22
# 5 100 3nT 34
# 6 101 3nT 34
# 7 102 3nT 34
# 8 103 3nT 34
# 9 104 1nT 12
# 10 105 1nT 12
# 11 106 1nT 12
# 12 104 3nT 16
# 13 105 3nT 16
# 14 106 3nT 16
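Side note (my addition, assuming R >= 4.0.0, where sequence() gained a from argument): the position column can be generated even more directly:
## sequence(n, from = start) yields start, start+1, ..., start+n-1 for each row
res <- data.frame(position = sequence(n, from = df$start),
                  state = rep(df$state, n),
                  freq = rep(df$freq, n))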
Here is one approach using data.table.
Build your data:
require(data.table)
fakedata <- data.table(start=c(100,100,104,104),
end=c(103,103,106,106),
state=c("1nT","3nT","1nT","3nT"),
freq=c(22,34,12,16))
Perform calculation
fakedata[ , dur := (end-start+1)]
outdata <- fakedata[ , lapply(.SD,function(x) rep(x,dur))]
outdata[ , position := (start-1)+1:.N, by=list(start,end,state)]
And the output
start end state freq dur position
1: 100 103 1nT 22 4 100
2: 100 103 1nT 22 4 101
3: 100 103 1nT 22 4 102
4: 100 103 1nT 22 4 103
5: 100 103 3nT 34 4 100
6: 100 103 3nT 34 4 101
7: 100 103 3nT 34 4 102
8: 100 103 3nT 34 4 103
9: 104 106 1nT 12 3 104
10: 104 106 1nT 12 3 105
11: 104 106 1nT 12 3 106
12: 104 106 3nT 16 3 104
13: 104 106 3nT 16 3 105
14: 104 106 3nT 16 3 106
This can be accomplished with a simple apply command.
Let's build this in sequence:
You want to perform an operation based on every row, so apply by row should be your first thought (or a for loop). So we know we want to use apply(data, 1, row.function).
Think of what you would want to do for a single row. You want to repeat state and freq for every number between start and stop.
To get the range of numbers between start and stop we can use the colon operator start:stop.
Now, R will automatically repeat the values in a vector to match the longest vector length when creating a data.frame. So, we can create the piece from a single row like this:
data.frame(position=(row['start']:row['end']), state=row['state'], freq=row['freq'])
Then we want to bind it all together, so we wrap the apply result in do.call('rbind', ...).
Putting this all together now, we have:
do.call('rbind',
apply(data, 1, function(row) {
data.frame(position=(row['start']:row['end']),
state=row['state'], freq=row['freq'])
}))
Which will give you what you want. Hopefully this helps teach you how to approach problems like this in the future too!
Here's a rough implementation using a for loop.
a = t(matrix(c(100, 103, "1nT" , 22,
100, 103 , "3nT" , 34,
104, 106 , "1nT" , 12,
104, 106 , "3nT" , 16), nrow = 4))
a = data.frame(a, stringsAsFactors = FALSE)
colnames(a) = c("start", "end" , "state", "freq")
a$start = as.numeric(as.character(a$start))
a$end = as.numeric(as.character(a$end))
n = dim(a)[1]
res = NULL
for (i in 1:n) {
position = a$start[i]:a$end[i]
state = rep(a$state[i], length(position))
freq = rep(a$freq[i], length(position))
temp = cbind.data.frame(position, state, freq)
res = rbind(res, temp)
}
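For completeness, a tidyverse variant (my sketch, assuming tidyr's uncount() is available): uncount() repeats each row end - start + 1 times, and row_number() rebuilds the positions within each original region:
library(dplyr)
library(tidyr)
df %>% ## the df built with read.table() above
  uncount(end - start + 1) %>%
  group_by(start, end, state) %>%
  mutate(position = start + row_number() - 1) %>%
  ungroup() %>%
  select(position, state, freq)
This yields the same 14-row result as the vectorized base R approach above.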
