calculate the sum per 2 columns - r

I have the following data frame:
all <- structure(list(counts = c(0L, 0L, 3L, 0L, 2L, 0L), counts = c(0L,
2L, 1L, 0L, 5L, 1L), counts = c(1L, 9L, 17L, 0L, 7L, 2L), counts = c(2L,
1L, 13L, 0L, 7L, 5L), counts = c(1L, 1L, 3L, 0L, 2L, 10L), counts = c(0L,
2L, 2L, 0L, 8L, 9L), counts = c(0L, 4L, 4L, 0L, 4L, 0L), counts = c(0L,
2L, 3L, 0L, 7L, 1L), counts = c(0L, 2L, 0L, 0L, 3L, 8L), counts = c(1L,
3L, 3L, 0L, 4L, 13L), counts = c(0L, 6L, 12L, 0L, 3L, 2L), counts = c(0L,
7L, 6L, 0L, 4L, 2L), counts = c(1L, 0L, 1L, 0L, 2L, 5L), counts = c(1L,
1L, 2L, 0L, 3L, 6L), counts = c(0L, 2L, 1L, 1L, 2L, 0L), counts = c(0L,
4L, 1L, 0L, 4L, 0L), counts = c(0L, 2L, 1L, 0L, 3L, 3L), counts = c(0L,
1L, 1L, 0L, 2L, 1L), counts = c(0L, 3L, 1L, 0L, 5L, 0L), counts = c(0L,
4L, 5L, 0L, 1L, 0L), counts = c(0L, 2L, 5L, 0L, 8L, 23L), counts = c(0L,
0L, 2L, 0L, 1L, 7L), counts = c(1L, 0L, 0L, 0L, 1L, 2L), counts = c(0L,
0L, 0L, 0L, 1L, 0L)), .Names = c("counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts",
"counts", "counts", "counts", "counts", "counts", "counts", "counts"
), row.names = c("1/2-SBSRNA4", "A1BG", "A1BG-AS1", "A1CF", "A2LD1",
"A2M"), class = "data.frame")
In this data frame I need the sum of every 2 columns. In the simplest form this can be done with all[1] + all[2], all[3] + all[4], etc., and at the end I could cbind the new frames again, but I know this can be done with something like aggregate or apply. Only I have not yet managed to succeed. My best try so far is: allfinal <-aggregate( all ,FUN = sum,by=[1:2] ) I know this is not how it should work, but I can't figure out how to correctly use aggregate or (s)apply to do this. Any tips are appreciated!
As output I want a data frame that holds the sum of each pair of columns in a single column. The data.frame now has 24 columns, so at the end I need 12 columns.

you can try this:
# Transpose so the original columns become rows, sum each consecutive pair of
# rows with rowsum(), then transpose back. The result is a matrix whose
# columns are named after the pair index ("1", "2", ...).
t(rowsum(t(all), group = gl(n = ncol(all) / 2, k = 2)))
hth

Related

How can I fix the runtime error in ecdf function in R?

When I run this code-
# Read a tab-delimited file chosen interactively.
a<- read.delim(file.choose("data.txt"))
# NOTE(review): this expects a column named `d`; if the file has no such
# column, a$d is NULL and the ecdf() call below has no values to work with.
d<-sort(a$d)
# Plot the empirical CDF of d as a dashed red step line.
plot(d, sort(ecdf(d)(d)),type="s", lty=2,col="red", ylab= "P(X<=x)",ylim= 0:1)
it gives me this error:
Error in ecdf(d) : 'x' must have 1 or more non-missing values
help?
I ran your code and it seems to be all right. I've just changed the second line of your code, because the only column provided in your data was named x, not d.
Check it out:
# Sample data: a data frame with a single integer column, x (100 rows).
a <- structure(list(x = c(4L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 4L, 1L, 2L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 3L, 0L, 5L, 2L, 2L, 1L, 0L, 0L, 2L, 0L, 0L,
0L, 1L, 3L, 3L, 0L, 0L, 0L, 2L, 0L, 2L, 1L, 1L, 4L, 4L,
0L, 1L, 3L, 1L, 0L, 2L, 1L, 2L, 0L, 0L, 0L, 1L, 0L, 1L,
6L, 0L, 2L, 2L, 0L, 1L, 1L, 2L, 1L, 0L, 1L, 0L, 3L, 0L,
3L, 0L, 4L, 3L, 2L, 2L, 2L, 1L, 3L, 0L, 3L, 2L, 0L, 1L,
2L, 1L)), class = "data.frame", row.names = c(NA, -100L))
# The data has no column d, so take the values from x (the only column) and
# sort them.
d <- sort(a$x)
# Empirical CDF of d, drawn as a dashed red step line.
plot(
  d, sort(ecdf(d)(d)),
  type = "s", lty = 2, col = "red",
  ylab = "P(X<=x)", ylim = 0:1
)
Output:

Graph with two y axes in r

I was searching for code to create a nice graph with two y axes. However, I couldn't find a solution that helped me. (I am not using R often.)
I have the time as the x variable, and I want to plot value1 and value2 on the left and right y axes respectively.
My data are in one dataset, so I want to plot data$time on the x axis, data$value1 on the left y axis and data$value2 on the right y axis. Each series (data$value1 and data$value2) should be drawn as a line in its own colour. In addition there should be a legend for the lines.
Can someone help me with it?
Please, find my data w below.
Here is a script showing how a double y-axis may be integrated using ggplot
# Bar chart of death counts per WHO class (left axis) overlaid with red
# diamonds for the mortality rate per 100 person-years (right axis).
# Requires ggplot2 to be attached and the data frame `w` to exist.

# Break positions shared by both y axes. `yaks` was used but never defined in
# the original snippet; seq(0, 18, by = 2) is inferred from the ten right-axis
# labels ("0" to "45"), the y limits c(0, 18), and the 2.5 ratio between the
# rate labels and the plotted point heights -- NOTE(review): confirm.
yaks <- seq(0, 18, by = 2)

ggplot() +
  # Count the WHO class of every record with Death == 1. which() drops the
  # records whose Death is NA instead of passing an NA x value to the bar layer.
  geom_bar(
    mapping = aes(x = w$WHO[which(w$Death == 1)]),
    stat = "count", alpha = 0.2, colour = "#1C73C2", fill = "#ECF0F9"
  ) +
  # The secondary axis uses the same positions as the primary one (~ . * 1)
  # and only relabels them.
  scale_y_continuous(
    name = "Number of deaths", breaks = yaks,
    sec.axis = sec_axis(
      ~ . * 1,
      name = "Mortality rate per 100 person-yrs", breaks = yaks,
      labels = c("0", "5", "10", "15", "20", "25", "30", "35", "40", "45")
    )
  ) +
  scale_x_continuous(
    name = "", breaks = c(1, 2, 3, 4),
    labels = c(
      "\nWHO-I\nn=37", "\nWHO-II\nn=29", "\nWHO-III\nn=19", "\nUnknown\nn=25"
    )
  ) +
  coord_cartesian(ylim = c(0, 18)) +
  # One red diamond per WHO class, positioned on the left-axis scale.
  geom_point(
    mapping = aes(x = 1, y = 3.329993),
    size = 5, alpha = 0.7, shape = 18, colour = "red"
  ) +
  geom_point(
    mapping = aes(x = 2, y = 12.424504),
    size = 5, alpha = 0.7, shape = 18, colour = "red"
  ) +
  geom_point(
    mapping = aes(x = 3, y = 17.23519),
    size = 5, alpha = 0.7, shape = 18, colour = "red"
  ) +
  geom_point(
    mapping = aes(x = 4, y = 4.549763),
    size = 5, alpha = 0.7, shape = 18, colour = "red"
  ) +
  # Text labels placed 1.3 units above each diamond.
  annotate(
    "text", x = c(1, 2, 3, 4),
    y = c(3.329993 + 1.3, 12.424504 + 1.3, 17.23519 + 1.3, 4.549763 + 1.3),
    label = c("8.3", "31.1", "43.1", "11.4"),
    col = "red", fontface = 2, cex = 4
  ) +
  # Colour-code the axes: blue text for the left axis, red for the right.
  theme(
    axis.text.y.right = element_text(color = "red", size = 11),
    axis.title.y = element_text(
      color = "darkgrey", size = 11, face = "bold",
      margin = margin(t = 0, r = 15, b = 0, l = 0)
    ),
    axis.text.y = element_text(color = "#1C73C2", size = 11),
    axis.title.y.right = element_text(
      color = "darkgrey", size = 11, face = "bold",
      margin = margin(t = 0, r = 0, b = 0, l = 15)
    ),
    axis.text.x = element_text(color = "grey20", size = 11)
  )
My data w
w <- structure(list(WHO = c(1L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 1L, 2L,
3L, 3L, 3L, 1L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 4L, 4L, 1L, 4L, 1L, 2L, 1L, 4L, 1L, 4L, 4L, 4L,
4L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 2L, 1L, 2L, 2L, 4L, 4L,
4L, 2L, 4L, 1L, 4L, 4L, 2L, 4L, 4L, 3L, 4L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
3L, 4L, 3L, 4L, 3L), response = c(0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, NA, 1L), Death = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, NA, 1L)), class = "data.frame", row.names = c(NA,
-111L))

ggplot2 with splitting by groups in R [duplicate]

This question already has answers here:
ggplot: colour points by groups based on user defined colours
(3 answers)
Closed 4 years ago.
I am trying to produce a scatterplot between variables, split by two groups
# Asker's attempt: `group` is mapped to Religion only, and Macro.Region is
# passed to aes() without an aesthetic name.
ggplot(terr, aes(x = Killed, y = Terr..Attacks,group=Religion,Macro.Region)) +
geom_point() +
geom_smooth()
but I didn't get the results I expected.
How can I create a scatterplot by groups?
terr=structure(list(Macro.Region = structure(c(5L, 4L, 4L, 3L, 4L,
6L, 1L, 2L, 4L, 3L, 6L, 5L, 4L, 4L, 3L, 4L, 6L, 1L, 2L, 4L, 3L,
6L), .Label = c("Arab Countries", "Asia", "Eastern Europe and post-Soviet",
"Latin America", "Sub-Saharan Africa", "Western States"), class = "factor"),
Killed = c(0L, 0L, 0L, 6L, 0L, 0L, 1L, 76L, 0L, 0L, 36L,
0L, 0L, 0L, 6L, 0L, 0L, 1L, 76L, 0L, 0L, 36L), Terr..Attacks = c(2L,
0L, 2L, 2L, 0L, 9L, 3L, 88L, 0L, 0L, 6L, 2L, 0L, 2L, 2L,
0L, 9L, 3L, 88L, 0L, 0L, 6L), Religion = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L), .Label = c("Christianity", "Islam"
), class = "factor"), GDP.capita = c(6813L, 26198L, 20677L,
9098L, NA, 49882L, 51846L, 4207L, 17508L, 18616L, 46301L,
6813L, 26198L, 20677L, 9098L, NA, 49882L, 51846L, 4207L,
17508L, 18616L, 46301L)), class = "data.frame", row.names = c(NA,
-22L))
# Scatterplot of Terr..Attacks against Killed, faceted by Religion and
# Macro.Region; points are drawn at 25% opacity.
ggplot(data = terr, mapping = aes(x = Killed, y = Terr..Attacks)) +
  geom_point(alpha = 0.25) +
  facet_wrap(Religion ~ Macro.Region)

R: Recoding multiple dummy variables into a single variable and replacing the corresponding dummy value with the variable name

I have a dataset with 14 mutually exclusive categories of call type all coded as dummy variables. Here is a small sample:
dput(df)
structure(list(MON1_12 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), WEEK1_53 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AGENT_ID = structure(c(3L,
4L, 7L, 8L, 1L, 6L, 5L, 9L, 2L, 10L), .Label = c("A129", "A360",
"A407", "B891", "D197", "L145", "L722", "O518", "T443", "W764"
), class = "factor"), CallsHandled = c(1L, 4L, 2L, 14L, 1L, 2L,
5L, 1L, 1L, 3L), CONTENT = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), CLAIMS = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
CREDIT_CARD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
DEDUCT_BILL = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
HCREFORM = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("MON1_12",
"WEEK1_53", "AGENT_ID", "CallsHandled", "CONTENT", "CLAIMS",
"CREDIT_CARD", "DEDUCT_BILL", "HCREFORM"), class = "data.frame", row.names = c(NA,
-10L))
I want to combine the dummy variables into a single new variable called "QUEUE" that replaces the value of "1" with the name of its corresponding dummy variable. Here is an example of what this would look like:
dput(df2)
structure(list(MON1_12 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), WEEK1_53 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AGENT_ID = structure(c(3L,
4L, 7L, 8L, 1L, 6L, 5L, 9L, 2L, 10L), .Label = c("A129", "A360",
"A407", "B891", "D197", "L145", "L722", "O518", "T443", "W764"
), class = "factor"), CallsHandled = c(1L, 4L, 2L, 14L, 1L, 2L,
5L, 1L, 1L, 3L), QUEUE = structure(c(1L, 4L, 2L, 4L, 1L, 3L,
3L, 5L, 5L, 4L), .Label = c("CLAIMS", "CONTENT", "CREDIT_CARD",
"DEDUCT_BILL", "HCREFORM"), class = "factor")), .Names = c("MON1_12",
"WEEK1_53", "AGENT_ID", "CallsHandled", "QUEUE"), class = "data.frame", row.names = c(NA,
-10L))
Edit in response to having question marked down: This is what I had tried this afternoon on recommendation with a slightly different sample dataframe:
# Asker's attempt: encode each row as a weighted sum of the dummy columns
# (CONTENT = 1, CLAIMS = 2, ..., HCREFORM = 5) and turn it into a factor.
df$Queue <- as.factor(df$CONTENT + df$CLAIMS*2 + df$CREDIT_CARD*3 + df$DEDUCT_BILL*4 + df$HCREFORM*5)
# NOTE(review): as.factor() only creates levels for the sums that actually
# occur (including 0 for all-zero rows), so these five labels are presumably
# matched to the wrong codes -- confirm against the output shown below.
levels(df$Queue) <- c("CONTENT", "CLAIMS", "CREDIT_CARD","DEDUCT_BILL","HCREFORM")
View(df)
But I received a column of NA's in the Queue column. So, I recreated another sample dataset here. This dataframe is adequately representative of what I'll receive in reality, except I'll have about 40 variables and 2 million rows. When I run what I tried above on "df" above I get the following incorrect result:
dput(df)
structure(list(MON1_12 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), WEEK1_53 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AGENT_ID = structure(c(3L,
4L, 7L, 8L, 1L, 6L, 5L, 9L, 2L, 10L), .Label = c("A129", "A360",
"A407", "B891", "D197", "L145", "L722", "O518", "T443", "W764"
), class = "factor"), CallsHandled = c(1L, 4L, 2L, 14L, 1L, 2L,
5L, 1L, 1L, 3L), CONTENT = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), CLAIMS = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
CREDIT_CARD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
DEDUCT_BILL = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
HCREFORM = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Queue = structure(c(2L,
1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("CONTENT",
"CLAIMS", "CREDIT_CARD", "DEDUCT_BILL", "HCREFORM"), class = "factor")), .Names = c("MON1_12",
"WEEK1_53", "AGENT_ID", "CallsHandled", "CONTENT", "CLAIMS",
"CREDIT_CARD", "DEDUCT_BILL", "HCREFORM", "Queue"), row.names = c(NA,
-10L), class = "data.frame")
I also tried:
# Asker's attempt: for each row, pick the names of the dummy columns (5 to 9)
# whose value is non-zero, and bind the result to columns 1 to 4 as QUEUE.
df3 <- cbind(df[1:4], QUEUE = apply(df[5:9], 1, function(N) names(N)[as.logical(N)]))
but received the following error: "Error in data.frame("CLAIMS", character(0), character(0), "DEDUCT_BILL", :
arguments imply differing number of rows: 1, 0:
You could use max.col to get the column index that has a value of '1' in each row for columns 5 to 9. (The 'df' example is not correct, as most of its rows are all 0s. The corrected one is below.)
# For each row, find the dummy column (every column after the first four)
# holding the largest value and store that column's name in QUEUE.
# ties.method = "first" makes the choice deterministic: max.col() otherwise
# breaks ties at random, so a row of all 0s would get a different name on
# each run.
df$QUEUE <- names(df)[-c(1:4)][max.col(df[-c(1:4)], ties.method = "first")]
Or you can do
# Alternative: multiply the dummy matrix by the vector 1, 2, ..., k. With
# exactly one 1 per row the product is the position of that 1, which is then
# used to look up the column name.
df$QUEUE <- names(df)[-(1:4)][
  drop(as.matrix(df[-(1:4)]) %*% seq_along(df[-(1:4)]))
]
Update
Based on the edited dataset 'df', some rows are all '0's for columns 5:9, and the expected result shows 'QUEUE' as 'CONTENT' for them. In that case, we can first modify the 'CONTENT' column to change the values where rows are all 0's and then apply either of the code snippets above
# Rows whose dummy columns (5 to 9) sum to 0 are assigned to CONTENT, so that
# no row is left without a 1 before the column name is looked up.
df$CONTENT[rowSums(df[5:9]) == 0] <- 1
df$QUEUE1 <- names(df[5:9])[max.col(df[5:9])]
df$QUEUE1
#[1] "CLAIMS" "CONTENT" "CONTENT" "DEDUCT_BILL" "CONTENT"
#[6] "CONTENT" "CONTENT" "CONTENT" "CONTENT" "CONTENT"
data
df <- structure(list(MON1_12 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), WEEK1_53 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
AGENT_ID = structure(c(3L,
4L, 7L, 8L, 1L, 6L, 5L, 9L, 2L, 10L), .Label = c("A129", "A360",
"A407", "B891", "D197", "L145", "L722", "O518", "T443", "W764"
), class = "factor"), CallsHandled = c(1L, 4L, 2L, 14L, 1L, 2L,
5L, 1L, 1L, 3L), CONTENT = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0), CLAIMS = c(1,
0, 0, 0, 1, 0, 0, 0, 0, 0), CREDIT_CARD = c(0, 0, 0, 0, 0, 1,
1, 0, 0, 0), DEDUCT_BILL = c(0, 1, 0, 1, 0, 0, 0, 0, 0, 1),
HCREFORM = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 0)), .Names = c("MON1_12", "WEEK1_53",
"AGENT_ID", "CallsHandled", "CONTENT", "CLAIMS", "CREDIT_CARD",
"DEDUCT_BILL", "HCREFORM"), row.names = c(NA, -10L), class = "data.frame")
This should produce the desired result:
# Keep columns 1 to 4 and add QUEUE: for every row, the name of the dummy
# column (5 to 9) whose value is non-zero.
df2 <- cbind(
  df[1:4],
  QUEUE = apply(
    df[5:9], MARGIN = 1,
    FUN = function(flags) names(flags)[as.logical(flags)]
  )
)
provided that exactly one of the dummy variables is 1 in every row (which is not true in your original sample of df).
Explanation: df[1:4] selects the columns one through four to be preserved in the output. It is then column bound to QUEUE using cbind function. QUEUE is obtained by iterating through the dummy variables (columns five through nine), row-wise over the data set df and selecting the column-name that contains the value one.

R software: error when using cozigam() function

I am modelling the potential distribution of a species using COZIGAM package. I have the response variable ("pb", which tells where the species is present) and the predictor variables (e.g. altitude, temperature, precipitation, etc).
When I run this formula:
# devtools::install_github('AndrewLJackson/COZIGAM')
# Call cozigam() with family = poisson and one s() term for each of the 18
# predictor columns of sdmdata2; pb is the response.
coz.model <- cozigam(
  formula = pb ~ s(altitude) + s(combustible) + s(distribution) +
    s(e1) + s(e2) + s(e3) + s(euc.human) + s(euc.river) +
    s(fccarb) + s(fccmat) + s(forarb) + s(aspect) + s(slope) +
    s(precipitation) + s(radiation) + s(tipestr_class) +
    s(tipestr_forest) + s(tmean),
  data = sdmdata2,
  family = poisson
)
the following error appears:
Error in as.matrix(x) : object 'altitude' not found
However, when I run as.matrix(sdmdata2), the 'altitude' variable exists in my matrix. The output of dput(head(sdmdata2)) is:
structure(list(X = 1:6, pb = c(2L, 2L, 2L, 2L, 2L, 2L), altitude = c(879L,
1094L, 1035L, 410L, 342L, 665L), combustible = c(6L, 6L, 3L,
0L, 3L, 3L), distribution = c(6L, 6L, 6L, 0L, 6L, 0L), e1 = c(4L,
4L, 2L, 0L, 4L, 0L), e2 = c(0L, 0L, 2L, 0L, 2L, 0L), e3 = c(0L,
0L, 4L, 0L, 2L, 0L), euc.human = c(790.569397, 3201.562012, 1750,
250, 250, 1952.562012), euc.river = c(0, 4069.705078, 353.5534058,
1030.776001, 559.0170288, 0), fccarb = c(90L, 70L, 40L, 0L, 30L,
0L), fccmat = c(5L, 10L, 35L, 0L, 60L, 80L), forarb = c(1L, 1L,
2L, 0L, 5L, 0L), aspect = c(6L, 8L, 6L, 4L, 3L, 3L), slope = c(5L,
3L, 5L, 2L, 6L, 5L), precipitation = c(87.01500702, 79.57628632,
81.86239624, 75.10630798, 49.58106995, 69.55927277), radiation = c(160.1408997,
163.4971008, 161.8542938, 157.9179993, 159.2113953, 160.6203003
), tipestr_class = c(1L, 1L, 1L, 7L, 1L, 2L), tipestr_forest = c(6L,
6L, 6L, 0L, 6L, 0L), tmean = c(141.7760925, 134.9530029, 141.9192047,
171.9972992, 186.2566986, 157.0391998)), .Names = c("X", "pb",
"altitude", "combustible", "distribution", "e1", "e2", "e3", "euc.human",
"euc.river", "fccarb", "fccmat", "forarb", "aspect", "slope",
"precipitation", "radiation", "tipestr_class", "tipestr_forest",
"tmean"), row.names = c(NA, 6L), class = "data.frame")
Does anyone know what the problem is?

Resources