Related
I am a complete beginner when it comes to R but I am trying to analyze a friendship network and I want to color in the nodes according to what the persons status is, so either 0 = high status (maybe in yellow) or 1= low status (maybe in red).
I created the status variable with the data from wave 1, removed the NA and those that did not participate in both waves and created the network with the data of the second wave
w1$Status <- NA
w1$Status[!is.na(w1$diplomamother)|!is.na(w1$diplomafather)]<-1
w1$Status[w1$diplomamother!=2&w1$diplomafather!=2] <-0
#Is this neccessary?
statusw1 <- w1 %>%
select(Status, NW_ID)
statusw1[-c(1, 10, 12, 13, 17, 44, 50, 51, 52, 53, 54, 15, 16, 20, 22, 25, 33, 38, 57, 58, 59, 60, 62),]
statusw1 <- statusw1[-c(1, 10, 12, 13, 17, 44, 50, 51, 52, 53, 54, 15, 16, 20, 22, 25, 33, 38, 57, 58, 59, 60, 62),]
friendsw2 <- friendsw2 %>%
select(!grep(c("_15_|_16_|_20_|_22_|_25_|_33_|_38_|_57_|_58_|_59_|_60_|_62_"), colnames(friendsw2)))
friends_ma2 <- as.matrix(data.frame(friendsw2, row.names = "NW_ID"))
colnames(friends_ma2) <- rownames(friends_ma2)
friends_ma2[is.na(friends_ma2)] <- 0
friends_ma2[friends_ma2 == ""] <- 0
storage.mode(friends_ma2) <- "numeric"
friends_gr2 <- graph.adjacency(friends_ma2, mode = "directed", diag = FALSE)
plot(friends_gr2)
What do I have to do next to visualize the status?
I hope what I explained is understandable! Thank you for your help!
I've got a loop in my code that I would like to rewrite so running the code takes a little less time to compete. I know you allways have to avoid loops in the code but I can't think of an another way to accomplice my goal.
So I've got a dataset "df_1531" containing a lot of data that I need to cut into pieces by using subset() (if anyone knows a better way, let me know ;) ). I've got a vector with 21 variable names on which I like assign a subset of df_1531. Furthermore the script contains 22 variables with constrains (shift_XY_time).
So, this is my code now...
# list containing different slots
shift_time_list<- c(startdate, shift_1m_time, shift_1a_time, shift_1n_time,
shift_2m_time, shift_2a_time, shift_2n_time,
shift_3m_time, shift_3a_time, shift_3n_time,
shift_4m_time, shift_4a_time, shift_4n_time,
shift_5m_time, shift_5a_time, shift_5n_time,
shift_6m_time, shift_6a_time, shift_6n_time,
shift_7m_time, shift_7a_time, shift_7n_time)
# List with subset names
shift_sub_list <- c("shift_1m_sub", "shift_1a_sub", "shift_1n_sub",
"shift_2m_sub", "shift_2a_sub", "shift_2n_sub",
"shift_3m_sub", "shift_3a_sub", "shift_3n_sub",
"shift_4m_sub", "shift_4a_sub", "shift_4n_sub",
"shift_5m_sub", "shift_5a_sub", "shift_5n_sub",
"shift_6m_sub", "shift_6a_sub", "shift_6n_sub",
"shift_7m_sub", "shift_7a_sub", "shift_7n_sub")
# The actual loop that I'd like to rewrite
for (i in 1:21) {
assign(shift_sub_list[i], subset(df_1531, df_1531$'PLS FFM' >= shift_time_list[i] & df_1531$'PLS FFM' < shift_time_list[i+1]))
}
Running the loop takes approximately 6 or 7 seconds. So, if anyone knows a better/cleaner or quicker way to write my code, I desperately like to hear your suggestion/opinion.
**Reproducible example **
mydata <- cars
dput(cars)
structure(list(speed = c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16,
16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20,
22, 23, 24, 24, 24, 24, 25), dist = c(2, 10, 4, 22, 16, 10, 18,
26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80,
20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32,
48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)), class = "data.frame", row.names = c(NA,
-50L))
dist_interval_list <- c( 0, 5, 10, 15,
20, 25, 30, 35,
40, 45, 50, 55,
60, 65, 70, 75,
80, 85, 90, 95,
100, 105, 110, 115, 120)
var_name_list <- c("var_name_1a", "var_name_1b", "var_name_1c", "var_name_1d",
"var_name_2a", "var_name_2b", "var_name_2c", "var_name_2d",
"var_name_3a", "var_name_3b", "var_name_3c", "var_name_3d",
"var_name_4a", "var_name_4b", "var_name_4c", "var_name_4d",
"var_name_5a", "var_name_5b", "var_name_5c", "var_name_5d",
"var_name_6a", "var_name_6b", "var_name_6c", "var_name_6d")
for (i in 1:24){
assign(var_name_list[i], subset(mydata,
mydata$dist >= dist_interval_list[i] &
mydata$dist < dist_interval_list[i+1]))
}
Starting with the 'reproducible' part and the information that the final aim is to summarize another column, it is possible to exploit the fact that the intervals are non-overlapping and simply use the cut function.
library(tidyverse)
mydata %>%
mutate(interval = cut(dist, breaks = dist_interval_list)) %>%
group_by(interval) %>%
summarise(sum = sum(speed))
This should be much faster and will also help you not to get lost in a messy environment full of variables (which are actually part of your data). You want to keep all your data in a single data frame as long as possible;) You probably want to follow with something like purrrlyr::invoke_rows at the final modeling step, if your function does not work with data frames.
I have constructed models in glmer and would like to predict these on a rasterStack representing the fixed effects in my model. my glmer model is in the form of:
m1<-glmer(Severity ~ x1 + x2 + x3 + (1 | Year) + (1 | Ecoregion), family=binomial( logit ))
As you can see, I have random effects which I don't have as spatial layer - for example 'year'. Therefore the problem is really predicting glmer on rasterStacks when you don't have the random effects data random effects layers. If I use it out of the box without adding my random effects I get an error.
m1.predict=predict(object=all.var, model=m1, type='response', progress="text", format="GTiff")
Error in predict.averaging(model, blockvals, ...) :
Your question is very brief, and does not indicated what, if any, trouble you have encountered. This seems to work 'out of the box', but perhaps not in your case. See ?raster::predict for options.
library(raster)
# example data. See ?raster::predict
logo <- brick(system.file("external/rlogo.grd", package="raster"))
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
xy <- rbind(cbind(1, p), cbind(0, a))
v <- data.frame(cbind(pa=xy[,1], extract(logo, xy[,2:3])))
v$Year <- sample(2000:2001, nrow(v), replace=TRUE)
library(lme4)
m <- lmer(pa ~ red + blue + (1 | Year), data=v)
# here adding Year as a constant, as it is not a variable (RasterLayer) in the RasterStack object
x <- predict(logo, m, const=(data.frame(Year=2000)))
If you don't have the random effects, just use re.form=~0 in your predict call to predict at the population level:
x <- predict(logo, m, re.form=~0)
works without complaint for me with #RobertH's example (although I don't know if correctly)
I use randomForest model to predict class memberships. 'x' consists of 10 classes that I use to train 'training_predictors' values extracted from a large rasterstack/brick. The specific line of codes is:
r_tree<-randomForest(x ~. , data=training_predictors, ...)
Then I run 'predict' using the model 'r_tree' that I apply to the rasterstack 'predictor_data', as follow:
predictions<-predict(predictor_data, r_tree, filename=outraster, fun=predict na.rm=TRUE, format="PCDISK", overwrite=TRUE, progress="text", type="response").
The output is a raster that I use as thematic map.
I would like to use the conditional inference trees mode 'cforest' instead of randomForest to achieve the same goals.
I understand that 'predict' can be used with cforest, yet, I have not been able to generate raster files, such as those with randomForest as illustrated above.
It should run fine, but you may need to add the argument OOB=TRUE, and identify factors if there are any.
Example data
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
# extract values for points
xy <- rbind(cbind(1, p), cbind(0, a))
v <- data.frame(cbind(xy[,1], extract(logo, xy[,2:3])))
colnames(v)[1] <- 'pa'
Basic model
library(party)
m1 <- cforest(pa~., control=cforest_unbiased(mtry=3), data=v)
pc1 <- predict(logo, m1, OOB=TRUE)
plot(pc1)
Model with factors
v$red <- as.factor(round(v$red/100))
logo$red <- round(logo[[1]]/100)
m2 <- cforest(pa~., control=cforest_unbiased(mtry=3), data=v)
f <- list(levels(v$red))
names(f) <- 'red'
pc2 <- predict(logo, m2, OOB=TRUE, factors=f)
plot(pc2)
By the way, this comes almost straight out of the help file of raster::predict
I'm looking to take the derivative of the survival function in R and store it in a new function.
Here is my code so far:
install.packages("survival")
library(survival)
survival <- matrix(c(1, 555, 0, 82, 2, 473, 8, 30, 3, 435, 8, 27, 4, 400, 7, 22, 5,
371, 7, 26, 6, 338, 28, 25, 7, 285, 31, 20,8, 234, 32, 11, 9, 191,
24, 14, 10, 153, 27, 13, 11, 113, 22, 5, 12, 86, 23, 5, 13, 58, 18,
5, 14, 35, 9, 2, 15, 24, 7, 3, 16, 14, 11, 3),
ncol=4, byrow=TRUE)
year <- c()
for (i in 1:nrow(survival) ) year <- c(year, rep(i, survival[i, 4]))
for (i in 1:nrow(survival) ) year <- c(year, rep(i, survival[i, 3]))
state <- c(rep(1, sum(survival[, 4])), rep(0, sum(survival[, 3])))
my.surv <- Surv(year, state)
fit <- survfit(my.surv ~ 1)
my.fit <- survfit(my.surv ~ 1)
### K-M plot
plot(my.fit, main="Kaplan-Meier estimate with 95% confidence bounds",
xlab="time", ylab="survival function")
### K-M cumulative hazard function
H.hat <- -log(my.fit$surv)