re-coding data into categories R - r

I have a column with some numbers, each corresponding to a primary condition of a patient. I want to recode this data into types of conditions ex. neurological, psychiatric etc
Can you please help as the code below is horrendous (and also not working)
October_data_UK$Primary_cat <- ifelse(October_data_UK$PRIMARY==2|October_data_UK$PRIMARY==5|October_data_UK$PRIMARY==7|October_data_UK$PRIMARY==10|October_data_UK$PRIMARY==13|October_data_UK$PRIMARY==14|October_data_UK$PRIMARY==16|October_data_UK$PRIMARY==23|October_data_UK$PRIMARY==24|October_data_UK$PRIMARY==27,"Chronic_pain",
ifelse(October_data_UK$PRIMARY==4|October_data_UK$PRIMARY==9|October_data_UK$PRIMARY==15|October_data_UK$PRIMARY==21|October_data_UK$PRIMARY==22|October_data_UK$PRIMARY==31|October_data_UK$PRIMARY==35|October_data_UK$PRIMARY==37|October_data_UK$PRIMARY==38, "Neurological",
ifelse(October_data_UK$PRIMARY==1|October_data_UK$PRIMARY==3|October_data_UK$PRIMARY==6|October_data_UK$PRIMARY==12|October_data_UK$PRIMARY==17|October_data_UK$PRIMARY==18|October_data_UK$PRIMARY==20|October_data_UK$PRIMARY==25|October_data_UK$PRIMARY==26,October_data_UK$PRIMARY==30|October_data_UK$PRIMARY==32|October_data_UK$PRIMARY==34|October_data_UK$PRIMARY==36,"Psychiatric",
ifelse(October_data_UK$PRIMARY==8|October_data_UK$PRIMARY==11|October_data_UK$PRIMARY==19|October_data_UK$PRIMARY==33|October_data_UK$PRIMARY==28|October_data_UK$PRIMARY==29|October_data_UK$PRIMARY==39,"Other",NA))))
I just want to write all the numbers together without having to repeat "October_data_UK$PRIMARY"

Try assigning the values with logical indices.
Chronic_pain <- c(2, 5, 7, 10, 13, 14, 16, 23, 24, 27)
Neurological <- c(4, 9, 15, 21, 22, 31, 35, 37, 38)
Psychiatric <- c(1, 3, 6, 12, 17, 18, 20, 25, 26, 30, 32, 34, 36)
Other <- c(8, 11, 19, 33, 28, 29, 39)
i1 <- October_data_UK$Primary_cat %in% Chronic_pain
i2 <- October_data_UK$Primary_cat %in% Neurological
i3 <- October_data_UK$Primary_cat %in% Psychiatric
#i4 <- October_data_UK$Primary_cat %in% Other
October_data_UK$Primary_cat <- NA_character_ # this creates the column
October_data_UK$Primary_cat[i1] <- "Chronic_pain"
October_data_UK$Primary_cat[i2] <- "Neurological"
October_data_UK$Primary_cat[i3] <- "Psychiatric"

Related

tidying igraph plot and routing or TSP question

I have less experience in R and I need help tidying my plot as it looks messy. Also, my project is to find the best minimal route from Seoul to every city and back to Seoul. It is almost like Traveling Salesman Problem (TSP) but there are some cities needed to be visited more than once as it is the only way to reach certain cities. I don't know how to do and what packages to use.
This is my code for igraph plot
library(igraph)
g1 <- graph( c("Seoul","Incheon","Seoul","Goyang","Seoul","Seongnam","Seoul",
"Bucheon","Seoul","Uijeongbu","Seoul","Gimpo",
"Seoul","Gwangmyeong", "Seoul", "Hanam","Seoul", "Guri",
"Seoul","Gwacheon","Busan","Changwon","Busan","Gimhae",
"Busan","Jeju","Busan","Yangsan","Busan","Geoje",
"Incheon","Goyang","Incheon","Bucheon","Incheon","Siheung",
"Incheon","Jeju","Incheon","Gimpo","Daegu","Gumi",
"Daegu","Gyeongsan","Daegu","Yeongcheon","Daejeon",
"Cheongju","Daejeon","Nonsan","Daejeon","Gongju",
"Daejeon","Gyeryong","Gwangju","Naju","Suwon","Yongin",
"Suwon","Seongnam","Suwon","Hwaseong","Suwon","Ansan",
"Suwon","Gunpo","Suwon","Osan","Suwon","Uiwang",
"Ulsan","Yangsan","Ulsan","Gyeongju","Ulsan","Miryang",
"Yongin","Seongnam","Yongin","Hwaseong","Yongin","Pyeongtaek",
"Yongin","Gwangju-si","Yongin","Icheon","Yongin","Anseong",
"Yongin","Uiwang","Goyang","Gimpo","Goyang","Paju","Goyang",
"Yangju","Changwon","Gimhae","Changwon","Jinju","Changwon",
"Miryang","Seongnam","Gwangju-si","Seongnam","Hanam","Seongnam",
"Uiwang","Seongnam","Gwacheon","Hwaseong","Ansan","Hwaseong",
"Pyeongtaek","Hwaseong","Gunpo","Hwaseong","Osan","Cheongju",
"Cheonan","Cheongju","Sejong","Bucheon","Siheung","Bucheon",
"Gwangmyeong","Ansan","Anyang","Ansan","Siheung","Ansan",
"Gunpo","Namyangju","Uijeongbu","Namyangju","Chuncheon",
"Namyangju","Hanam","Namyangju","Guri","Cheonan","Pyeongtaek",
"Cheonan","Sejong","Cheonan","Asan","Cheonan","Anseong",
"Jeonju","Gimje","Gimhae","Yangsan","Gimhae","Miryang",
"Pyeongtaek","Asan","Pyeongtaek","Osan","Pyeongtaek","Anseong",
"Pyeongtaek","Dangjin","Anyang","Siheung","Anyang","Gwangmyeong",
"Anyang","Gunpo","Anyang","Gwacheon","Siheung","Gwangmyeong",
"Siheung","Gunpo","Pohang","Yeongcheon","Pohang","Gyeongju",
"Jeju","Gimpo","Jeju","Mokpo","Jeju","Seogwipo","Uijeongbu",
"Yangju","Uijeongbu","Pocheon","Paju","Yangju","Gumi","Gimcheon",
"Gumi","Sangju","Gwangju-si","Hanam","Gwangju-si","Icheon",
"Gwangju-si","Yeoju","Sejong","Gongju","Wonju","Chungju",
"Wonju","Jecheon","Wonju","Yeoju","Jinju","Sacheon", "Yangsan",
"Miryang","Asan","Gongju","Iksan","Gunsan","Iksan","Nonsan",
"Iksan","Gimje","Chuncheon","Pocheon","Gyeongsan","Yeongcheon",
"Gunpo","Uiwang","Suncheon","Yeosu","Suncheon","Gwangyang",
"Gunsan","Gimje","Gyeongju","Yeongcheon","Geoje","Tongyeong",
"Osan","Anseong","Yangju","Pocheon","Yangju","Dongducheon",
"Icheon","Anseong","Icheon","Yeoju","Mokpo","Naju","Chungju",
"Jecheon","Chungju","Yeoju","Chungju","Mungyeong","Gangneung",
"Donghae","Gangneung","Sokcho","Seosan","Dangjin","Andong",
"Yeongju","Pocheon","Dongducheon","Gimcheon","Sangju","Tongyeong",
"Sacheon","Nonsan","Gongju","Nonsan","Boryeong","Nonsan",
"Gyeryong","Gongju","Boryeong","Gongju","Gyeryong","Jeongeup",
"Gimje","Yeongju","Mungyeong","Yeongju","Taebaek","Sangju",
"Mungyeong","Sokcho","Samcheok","Samcheok","Taebaek",
"Suncheon","Gwangju"), directed=F)
E(g1)$distance <- c(27, 16, 20, 19, 20, 24, 14, 20, 15, 15, 36, 18, 299, 18, 53,
25, 8, 12, 440, 18, 36, 13, 33, 33, 31, 26, 15, 20, 13, 20,
19, 18, 13, 16, 10, 33, 36, 51, 24, 31, 28, 21, 23, 27, 22,
11, 12, 24, 18, 52, 27, 11, 13, 19, 13, 14, 34, 20, 23, 38,
18, 12, 9, 12, 7, 10, 19, 53, 11, 8, 20, 27, 11, 26, 24, 18,
33, 25, 18, 15, 44, 14, 12, 4, 5, 12, 12, 37, 21, 458, 146,
27, 10, 23, 24, 21, 36, 14, 23, 36, 21, 39, 33, 26, 20, 32,
40, 20, 29, 18, 47, 24, 4, 27, 19, 22, 29, 17, 24, 18, 13,
32, 18, 37, 28, 43, 51, 33, 56, 20, 28, 12, 30, 38, 29, 47,
17, 47, 22, 26, 46, 51, 20, 10, 36,63)
plot(g1, edge.label=E(g1)$distance,
vertex.label.cex=0.6, vertex.size=4)
igraph plot
Using trick from https://or.stackexchange.com/questions/5555/tsp-with-repeated-city-visits
library(data.table)
library(purrr)
library(TSP)
library(igraph)
We need to create distance matrix based on shortest paths for each pair of vertices:
vertex_names <- names(V(g1))
N <- length(vertex_names)
dt <- map(
head(seq_along(vertex_names), -1),
~data.table(
from = vertex_names[[.x]],
to = vertex_names[(.x+1):N],
path = map(
shortest_paths(g1, vertex_names[[.x]], vertex_names[(.x+1):N])[["vpath"]],
names
)
),
) %>%
rbindlist()
then we calculate distances of shortest paths:
m <- as_adjacency_matrix(g1, type = "both", attr = "distance", sparse = FALSE)
dt[, weight := map_dbl(path, ~sum(m[embed(.x, 2)[, 2:1, drop=FALSE]]))]
now we assemble new matrix:
dt <- rbind(
dt, dt[, .(from = to, to = from, path = map(path, rev), weight = weight)]
)
new_m <- matrix(0, N, N)
rownames(new_m) <- colnames(new_m) <- vertex_names
new_m[as.matrix(dt[, .(from,to)])] <- dt[["weight"]]
on this new matrix we use some heuristic to solve TSP (for exact solution you should use method="concorde"):
res <- new_m %>%
TSP() %>%
solve_TSP(repetitions = 1000, two_opt = TRUE)
now we exchange each pair of consecutive cities with shortest path:
start_city <- "Seoul"
path_dt <- c(start_city, labels(cut_tour(res, start_city)), start_city) %>%
embed(2) %>%
.[,2:1,drop = FALSE] %>%
"colnames<-"(c("from", "to")) %>%
as.data.table()
path_dt <- dt[path_dt, on = .(from ,to)]
my_path <- c(unlist(map(path_dt[["path"]], head, -1)), start_city)
my_path is heuristic solution with distance tour_length(res)

Choose a subsample of random numbers

I will play in the Brazilian Lottery with my friends. I requested every one of them to choose seven numbers. I create a variable for all of them.
pestana = c(04, 15, 29, 36, 54, 25, 07)
carol = c(7, 22, 30, 35, 44, 51, 57)
davi = c(8, 13, 21, 29, 37, 42, 55)
valerio = c(30, 20, 33, 14, 7, 41, 54)
victor = c(09, 11, 26, 33, 38, 52, 57)
Then, I created a list with all of the numbers, and a list with unique numbers (in order to avoid repeated numbers)
list = c(carol, davi, pestana, valerio, victor, diuli, cynara)
list2 = unique(list)
Finally, I made a sample() for the list2
sample(list2, 7)
After that, I was wondering. Is it possible for me not to use the unique and not have repeated numbers? Because for instance, that way, repeated numbers have the same probability of appearing, when in fact, they have more (for instance, seven appeared three times).
How about this:
pestana = c(04, 15, 29, 36, 54, 25, 07)
carol = c(7, 22, 30, 35, 44, 51, 57)
davi = c(8, 13, 21, 29, 37, 42, 55)
valerio = c(30, 20, 33, 14, 7, 41, 54)
victor = c(09, 11, 26, 33, 38, 52, 57)
list = c(carol, davi, pestana, valerio, victor)
l <- c(unlist(list))
nums <- table(l)
probs <- nums/sum(nums)
sample(names(probs), 7, prob = probs, replace=FALSE)
#> [1] "4" "33" "44" "11" "29" "52" "8"
Created on 2022-12-14 by the reprex package (v2.0.1)
Using the prob argument, you can make some values more likely to show up than others.

How can I extract a vector from a list of text files

I have many different text files with the same structure (900*600 pixels). Now I would like to extract 900*600 vectors each containing one data point from each text file.
For example I would like to have a vector from the position (x1,y1) with all the data points from all the text files.
Here you can see my code I have in order to generate a list with all the text files.
file.list = list.files(pattern="*.txt", full.names=T)
df = data.frame( files= sapply(file.list, FUN = function(x)readChar(x, file.info(x)$size)), stringsAsFactors=FALSE)
Now "df" is a list containing all the text files.
How can I extract now the different vectors with values from all the files?
This is my code so far. I need to define somehow a function (FUN).
files = lapply(df, FUN, header = F, sep="\t", skip = 2, stringsAsFactors = F)
I prepared a dummy data set.
a = matrix(c(15, 12, 37, 21, 37, 26, 33, 33, 27, 38, 32, 21, 24, 18,
20, 14, 32, 56, 16, 7, 23, 14, 34, 42), nrow = 3, ncol = 4)
b = matrix(c(14, 18, 34, 26, 37, 26, 32, 36, 21, 39, 32, 21, 22, 18,
20, 16, 42, 50, 16, 7, 23, 12, 36, 40), nrow = 3, ncol = 4)
c = matrix(c(10, 12, 34, 29, 31, 26, 30, 30, 20, 38, 36, 21, 29, 18,
20, 10, 32, 59, 16, 1, 23, 10, 39, 49), nrow = 3, ncol = 4)
file.list = list(a,b,c)
Here every variable corresponds to one textfile (listed in file.list). And instead of a 900*600 matrix there are 3*4 matrices.
Accordingly to your suggestions I implemented the the functions the following way.
cmbn = expand.grid(1:3, 1:4)
flen = length(file.list)
lapply(1:(nrow(cmbn)),function(t,lst,cmbn){
return(sapply(1:flen,function(i,t1,lst1,cmbn1){
return(lst1[[i]][cmbn1$Var1[t1],cmbn1$Var2[t1]])},t,lst,cmbn))}
,file.list,cmbn)
This should work for you:
It will take two loops. Not sure if this is the most optimized solution.
cmbn is the data.frame of co-ordinates.
cmbn = expand.grid(1:3,1:4)
#or `expand.grid(1:900,1:600)` in your case
flen = length(file.list)
lst will take file.list
lapply(1:(nrow(cmbn)),function(t,lst,cmbn)
{return(sapply(1:flen,function(i,t1,lst1,cmbn1){
return(lst1[[i]][cmbn1$Var1[t1],cmbn1$Var2[t1]])},t,lst,cmbn))
},file.list,cmbn)

print maximum values of each pair of columns

I am sorry if I am asking the question if it has already been asked, but I could not find it..
AGE<-c(25, 37, 57, 72, 48, 28, 31, 57, 43, 38)
LLS<-c(24, 1, 24, 24, 14, 21, 4, 12, 8, 1)
RLS<-c(11, 1, 14, 21, 7, 21, 22, 8, 27, 12)
dat <- data.frame(AGE, LLS, RLS)
and want to get the maximum values of column LLS AND RLS for each rows.
Please can you tell me how to do it?
Thanks.
You can try pmax
do.call(pmax, dat[-1])
#[1] 24 1 24 24 14 21 22 12 27 12
If this is for each pair of columns, you can use combn
res <- combn(names(dat),2, FUN=function(x) do.call(pmax,dat[x]))
colnames(res) <- apply(combn(names(dat),2),2, paste, collapse="_")
I believe that for each row, you want to return a single value, whichever is higher of RLS or LLS. Right?
If so, Akrun's answer is good. Alternatively you can use the handy rowMaxs() function in the matrixStats package. In my opinion it's a little more straightforward, but that's the only real advantage.
Here is code, you can combine into fewer steps, but I wrote it out to make it clear what is going on.
AGE<-c(25, 37, 57, 72, 48, 28, 31, 57, 43, 38)
LLS<-c(24, 1, 24, 24, 14, 21, 4, 12, 8, 1)
RLS<-c(11, 1, 14, 21, 7, 21, 22, 8, 27, 12)
dat <- data.frame(AGE, LLS, RLS)
Create a subset of your dataframe, including only the columns you want
dat2 <- dat[,2:3]
Turn the new dataframe into a matrix so rowMaxs() doesn't complain
dat3 <- as.matrix(dat2)
Load the matrixStats package and call rowMaxs()
library(matrixStats)
rowMaxs(dat3)
[1] 24 1 24 24 14 21 22 12 27 12

Taking the derivative of a Survival Function in R

I'm looking to take the derivative of the survival function in R and store it in a new function.
Here is my code so far:
install.packages("survival")
library(survival)
survival <- matrix(c(1, 555, 0, 82, 2, 473, 8, 30, 3, 435, 8, 27, 4, 400, 7, 22, 5,
371, 7, 26, 6, 338, 28, 25, 7, 285, 31, 20,8, 234, 32, 11, 9, 191,
24, 14, 10, 153, 27, 13, 11, 113, 22, 5, 12, 86, 23, 5, 13, 58, 18,
5, 14, 35, 9, 2, 15, 24, 7, 3, 16, 14, 11, 3),
ncol=4, byrow=TRUE)
year <- c()
for (i in 1:nrow(survival) ) year <- c(year, rep(i, survival[i, 4]))
for (i in 1:nrow(survival) ) year <- c(year, rep(i, survival[i, 3]))
state <- c(rep(1, sum(survival[, 4])), rep(0, sum(survival[, 3])))
my.surv <- Surv(year, state)
fit <- survfit(my.surv ~ 1)
my.fit <- survfit(my.surv ~ 1)
### K-M plot
plot(my.fit, main="Kaplan-Meier estimate with 95% confidence bounds",
xlab="time", ylab="survival function")
### K-M cumulative hazard function
H.hat <- -log(my.fit$surv)

Resources