assign value to a variable rather than using if statement - r

Right now, I have a dataset consisting of the variables Gbcode and ncounty:
> str(dt)
'data.frame': 840 obs. of 8 variables:
$ Gbcode : Factor w/ 28 levels "11","12","13",..: 21 22 23 24 25 26 27 28 16 17 ...
$ ncounty : num 0 0 0 0 0 0 0 0 0 0 ...
I want to do the following thing:
if a data record has Gbcode equal to 11, then assign 20 to its ncounty
Gbcode : 11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33
Corresponding ncounty: 20, 19, 198, 131, 112, 102, 60, 145, 22, 115, 95
I am wondering whether there is a better solution than writing an if statement, which would take many lines in this case (probably close to 20 lines of code).

This is a merge operation as far as I can tell. Make a little lookup table with your Gbcode/ncnty data, and then merge it in.
# lookup table
lkup <- data.frame(Gbcode=c(11,12,13),ncnty=c(20,19,198))
#example data
dt <- data.frame(Gbcode=c(11,13,12,11,13,12,12))
dt
# Gbcode
#1 11
#2 13
#3 12
#4 11
#5 13
#6 12
#7 12
Merge:
merge(dt, lkup, by="Gbcode", all.x=TRUE)
# Gbcode ncnty
#1 11 20
#2 11 20
#3 12 19
#4 12 19
#5 12 19
#6 13 198
#7 13 198
It is sometimes preferable to use match for this sort of thing too:
dt$ncnty <- lkup$ncnty[match(dt$Gbcode,lkup$Gbcode)]
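Applied to the full mapping in the question, that looks like this (a sketch; since dt$Gbcode is a factor, matching on its character values is safest):
lkup <- data.frame(Gbcode = c(11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33),
                   ncnty = c(20, 19, 198, 131, 112, 102, 60, 145, 22, 115, 95))
dt$ncounty <- lkup$ncnty[match(as.character(dt$Gbcode), lkup$Gbcode)]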

This could be more elegant, but should do the trick.
Gbcodes <- as.character(c(11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33))
ncounties <- c(20, 19, 198, 131, 112, 102, 60, 145, 22, 115, 95)
for(i in seq_along(Gbcodes)) dt$ncounty[dt$Gbcode == Gbcodes[i]] <- ncounties[i]
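A named vector gives an equivalent vectorized sketch of the same lookup, reusing Gbcodes and ncounties from above:
lookup <- setNames(ncounties, Gbcodes)   # the names become the keys
dt$ncounty <- unname(lookup[as.character(dt$Gbcode)])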

Related

How to create a new column based on other columns with if conditions in r

I am not able to find a way to generate a new column based on if conditions for a group of events in a column.
The column called "BF" represents the (i-3)rd value of the flow column, and the BF is the same for each "event" group. For example, in row 5 the value of "BF" is 39, which is the 3rd-previous value of the flow column (the flow in row 2), for all the "2"s in the event column.
The problem is that BF[i] can't be bigger than flow[i]. If BF[i] is bigger than flow[i], then the BF should be the (i-4)th or (i-5)th or (i-6)th... value of flow, until BF[i] is equal to or smaller than flow[i]. For example, in row 10 the value of the column "BF" is bigger than the value of the column "flow"; therefore, the value of BF_1 (the column I want to create) in row 10 is 37, which is the closest lower value of flow, in this case flow[i-6].
As an example, we have the following dataframe:
flow<- c(40, 39, 38, 37, 50, 49, 46, 44, 43, 45, 40, 30, 80, 75, 50, 55, 53, 51, 49, 100)
event<- c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4,5,5,5,5,6)
BF<- c(NA, NA, NA, NA, 39, 39, 39, 39, 39, 46, 46, 46, 45, 45, 45, 80, 80, 80, 80, 53)
a<- data.frame(flow, event, BF)
This is the desire output I'm looking for. I want to create the BF_1 column.
flow event BF BF_1
1 40 1 NA NA
2 39 1 NA NA
3 38 1 NA NA
4 37 1 NA NA
5 50 2 39 39
6 49 2 39 39
7 46 2 39 39
8 44 2 39 39
9 43 2 39 39
10 45 3 46 37
11 40 3 46 37
12 30 3 46 37
13 80 4 45 45
14 75 4 45 45
15 50 4 45 45
16 55 5 80 30
17 53 5 80 30
18 51 5 80 30
19 49 5 80 30
20 100 6 53 53
Is there a possible way to generate the column BF_1? Please let me know any thoughts. I am working with for loops and if conditions, but I am not able to hold the BF value for the entire group in the event column.
The coding is a bit inefficient (I could have used dplyr, etc.), but it will do the work and it matches the BF_1 column given:
flow <- c(40, 39, 38, 37, 50, 49, 46, 44, 43, 45, 40, 30, 80, 75, 50, 55, 53, 51, 49, 100)
event <- c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4,5,5,5,5,6)
BF <- c(NA, NA, NA, NA, 39, 39, 39, 39, 39, 46, 46, 46, 45, 45, 45, 80, 80, 80, 80, 53)
a <- data.frame(flow, event, BF)
a$BF_1 <- NA # default to NA first
for(i in 1:length(unique(a$event))){
  if(is.na(a[a$event == i, "BF"][1])) next
  if(a[a$event == i, "BF"][1] < a[a$event == i, "flow"][1]) a[a$event == i, "BF_1"] <- a[a$event == i, "BF"][1]
  if(a[a$event == i, "BF"][1] > a[a$event == i, "flow"][1]) {
    head <- min(which(a$event == i)) - 6
    if (head < 1) head <- 1 # making sure it doesn't run past row 1
    a[a$event == i, "BF_1"] <- min( a[head:min(which(a$event == i)), "flow"] ) # fill with the min of the flow subset at that position
  }
}
a
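A more literal sketch of the stated rule (my own alternative, assuming events are numbered consecutively in row order, as in the example): start from the 3rd-previous flow value and step back one row at a time until the value no longer exceeds the group's first flow.
a$BF_1 <- NA
for (g in unique(a$event)) {
  i <- match(g, a$event)                # first row of the event group
  if (i <= 3) next                      # no (i-3) value available, stays NA
  k <- i - 3                            # start at the 3rd-previous flow
  while (k >= 1 && a$flow[k] > a$flow[i]) k <- k - 1  # step back to i-4, i-5, ...
  if (k >= 1) a$BF_1[a$event == g] <- a$flow[k]
}
a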
One tidyverse possibility could be:
library(dplyr)
library(tidyr)
a %>%
  left_join(crossing(a, rename_with(a, ~ paste0(.x, "1"))) %>% # suffix the second copy so flow1/event1 can be referenced
              filter(event > event1) %>%
              group_by(event) %>%
              filter(flow == first(flow)) %>%
              slice(1:(n() - 3)) %>%
              slice(which.max(cumsum(flow > flow1))) %>%
              ungroup() %>%
              transmute(event,
                        flow_flag = flow1),
            by = c("event" = "event")) %>%
  mutate(BF_1 = ifelse(lag(flow, 3) > flow, flow_flag, lag(flow, 3))) %>%
  group_by(event) %>%
  mutate(BF_1 = first(BF_1)) %>%
  select(-flow_flag)
flow event BF BF_1
<dbl> <dbl> <dbl> <dbl>
1 40 1 NA NA
2 39 1 NA NA
3 38 1 NA NA
4 37 1 NA NA
5 50 2 39 39
6 49 2 39 39
7 46 2 39 39
8 44 2 39 39
9 43 2 39 39
10 45 3 46 37
11 40 3 46 37
12 30 3 46 37
13 80 4 45 45
14 75 4 45 45
15 50 4 45 45
16 55 5 80 30
17 53 5 80 30
18 51 5 80 30
19 49 5 80 30
20 100 6 53 53
It may be overcomplicated, but what it does is: first, it creates all combinations of rows (as the desired value can theoretically be anywhere in the data). Second, it identifies the first case per group fulfilling the condition (not taking the previous 3rd value into account). Finally, it joins this back onto the original df: if the 3rd-previous value per group fulfils the condition it is returned, otherwise the first value that is smaller than the actual value is returned.

Algorithm to Ascribe Value RStudio

I am trying to create an algorithm that essentially is a function of this data frame.
This is the code I was using, but it doesn't seem to be working.
I need image_id to be the independent variable so that when I input 7 into the function, I get back 10 and 15. If I were to input 8, I would get back 11 and 13.
num = function(image_id, category_id, data = categories) {x->y}
This is the data frame that I am using.
   category_id image_id cat_to_img_last_update
1           15        7                   NULL
2           11        8                   NULL
3           13        8                   NULL
4           10        7                   NULL
5           35        9                   NULL
6           78        9                   NULL
7          112       10                   NULL
8           61       10                   NULL
9           86       11                   NULL
10         101       11                   NULL
11          61       12                   NULL
12          86       12                   NULL
You probably don't need a function for this, but if you really want one, here is what it would look like:
# Read in data
categories <-
  data.frame(category_id = c(15, 11, 13, 10, 35, 78, 112, 61, 86, 101, 61, 86),
             image_id = c(7, 8, 8, 7, 9, 9, 10, 10, 11, 11, 12, 12),
             stringsAsFactors = FALSE)
num <- function(image_id, data = categories) {
  data$category_id[data$image_id == image_id]
}
num(7) # 15 10
num(8) # 11 13
Another option is a small function that also sorts the result:
df <- data.frame(
  category_id = c(15, 11, 13, 10, 35, 78, 112, 61, 86, 101, 61, 86),
  image_id = c(7, 8, 8, 7, 9, 9, 10, 10, 11, 11, 12, 12)
)
myfun <- function(num) { sort(df[df$image_id == num, "category_id"]) }
myfun(7)
myfun(8)
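If you need the lookup for every image_id at once, base split() builds the whole mapping in one pass (a sketch reusing df from above):
split(df$category_id, df$image_id)
# $`7`
# [1] 15 10
# $`8`
# [1] 11 13
# ...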

R xts object - subset data points for 5 consecutive seconds

I have a large xts object and want to subset the seconds in the time column, but only where there is a run of at least 5 consecutive seconds. I have up to 8 data points per second (these shouldn't be counted as 5 consecutive points, as they are measured within the same second).
And_sub_xts is my xts object
> str(And_sub_xts)
An ‘xts’ object on 2010-04-09 20:32:56/2010-04-26 06:56:57 containing:
Data: chr [1:164421, 1:11] "0.255416" "0.168836" "0.212126" "0.229442" "0.238100" "0.212126" "0.168836" ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:11] "CalSurge" "CalSway" "CalHeave" "Stat_Surge" ...
Indexed by objects of class: [POSIXct,POSIXt] TZ:
xts Attributes:
NULL
and the first 100 values for
abs(diff(.indexsec(And_sub_xts))) are
56 8 23 34 40 40 41 42 25 27 34 35 38 38 40 40 41 56 59 59 19 19 20 20 20 20 22 22 23 23 24 24 24 25 25 26 27 27 27 27 27 28 28 30 30 30 37 38 40 40 41 44 44 46 46 47 48 51 52 54 54 54 54 55 56 59 1 4 4 4 6 6 6 6 7 7 11 12 12 14 14 15 16 16 17 18 18 19 19 21 21 22 22 23 23 25 25 26 26 26
I marked the keeps in bold, so the subset should just consist of these data points.
I just realized that theoretically it could happen that there are some data points distributed like this:
2010-04-09 20:32:20
2010-04-09 20:32:20
2010-04-09 20:32:21
2010-04-09 20:32:22
2010-04-09 20:32:22
2010-04-09 20:40:22
2010-04-09 22:52:23
2010-04-10 20:52:24
which wouldn't be 5 consecutive seconds, but you can't account for this with the .indexsec command - maybe somebody knows a way to get around this.
Thanks for your help!
Here's one way to do it. x is sample data that contains index values with seconds equal to your first 100 values.
require(xts)
# sample data
s <- c(56, 8, 23, 34, 40, 40, 41, 42, 25, 27, 34, 35, 38, 38, 40,
40, 41, 56, 59, 59, 19, 19, 20, 20, 20, 20, 22, 22, 23, 23, 24,
24, 24, 25, 25, 26, 27, 27, 27, 27, 27, 28, 28, 30, 30, 30, 37,
38, 40, 40, 41, 44, 44, 46, 46, 47, 48, 51, 52, 54, 54, 54, 54,
55, 56, 59, 1, 4, 4, 4, 6, 6, 6, 6, 7, 7, 11, 12, 12, 14, 14,
15, 16, 16, 17, 18, 18, 19, 19, 21, 21, 22, 22, 23, 23, 25, 25,
26, 26, 26)
S <- cumsum(ifelse(c(0, diff(s)) < 0, 1, 0)) * 60 + s
x <- .xts(seq_along(S), S, tzone="UTC")
The basic idea is to aggregate your data to 1-second resolution, so you can use rle (run-length encoding) to find the consecutive 5-second observations. Then find the first and last timestamps of the sets of 5-second observations in your aggregated data, and then find the locations of those timestamps in your original data. Finally, use the locations of the timestamps in your original data to create sets of sequences you can use to subset the consecutive 5-second groups of observations.
# aggregate data to 1-second resolution
oneSec <- period.apply(x, endpoints(x, 'seconds'), identity)
# find the runs of 5 or more consecutive one-second increments
consec <- rle(diff(.index(oneSec)))
gte5s <- consec$lengths >= 5
# get the location of the first obs of the run in the 1-second data
begLoc <- cumsum(c(1,consec$lengths))[gte5s]
endLoc <- begLoc + consec$lengths[gte5s]
# get the timestamp of the first and last obs from the original data
beg <- lapply(index(oneSec)[begLoc], function(i) first(x[i, which.i=TRUE]))
end <- lapply(index(oneSec)[endLoc], function(i) last(x[i, which.i=TRUE]))
# create index vector between each value in 'beg' and 'end'
loc <- unlist(mapply(seq, beg, end))
# subset original object using index vector
X <- x[loc,]

Making sets from numbers in a dataframe

I have this data.frame:
d <- structure(list(X0 = c(9, 13, 13, 13, 35, 36, 37, 38, 39, 40,
40, 42, 43, 44), X0.1 = c(10, 40, 45, 46, 36, 37, 38, 40, 46,
45, 46, 43, 44, 46)), .Names = c("A", "B"), row.names = c(NA,
14L), class = "data.frame")
A B
1 9 10
2 13 40
3 13 45
4 13 46
5 35 36
6 36 37
7 37 38
8 38 40
9 39 46
10 40 45
11 40 46
12 42 43
13 43 44
14 44 46
I want to create sets like this: rows 2, 3 and 4 have 13, so they will be grouped into a set (13,40,45,46).
If any further row has even one member in common with this set, both members of that row will be included in the set.
Since row 8 has 40 in common with the above set, its members will also be included: (13,40,45,46,38)
Row 7 now has one member (38) in common with this set, so the other member (37) will also be included. The set becomes (13,40,45,46,38,37)
If neither of the 2 members of a row is common to any existing set, they form their own set. For example, row 1 has 9 and 10, neither of which appears in any other row, so they form the set (9,10)
At the end I want to print out all the sets.
Can I accomplish this in R? Thanks for your help.
Is this what you want?
f <- function(s, v) {
  m <- which(s$A %in% v | s$B %in% v)
  if (!any(m)) v
  else Recall(s[-m, ], sort(unique(c(v, unlist(s[m, ])))))
}
done <- c()
for (n in unique(unlist(d))) {
  if (n %in% done) next
  r <- f(d, n)
  done <- c(done, r)
  cat("(", r, ") ")
}
it outputs
( 9 10 ) ( 13 35 36 37 38 39 40 42 43 44 45 46 )
Updated
done <- c()
ret <- list()
for (n in unique(unlist(d))) {
  if (n %in% done) next
  r <- f(d, n)
  done <- c(done, r)
  cat("(", r, ") ")
  ret <- c(ret, list(r))
}
then,
> ret
[[1]]
[1] 9 10
[[2]]
[1] 13 35 36 37 38 39 40 42 43 44 45 46
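An alternative sketch, not from the original answers: each row can be read as an edge of a graph, and the requested sets are exactly its connected components. Assuming the igraph package is available:
library(igraph)
g <- graph_from_data_frame(d, directed = FALSE)  # each row A-B becomes an edge
comp <- components(g)$membership                 # component id per vertex
split(as.numeric(names(comp)), comp)             # one set per component
This reproduces ( 9 10 ) and ( 13 35 36 37 38 39 40 42 43 44 45 46 ).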

How to generate a frequency table in R with cumulative frequency and relative frequency

I'm new to R. I need to generate a simple frequency table (as in books) with cumulative frequency and relative frequency.
So I want to generate from some simple data like
> x
[1] 17 17 17 17 17 17 17 17 16 16 16 16 16 18 18 18 10 12 17 17 17 17 17 17 17 17 16 16 16 16 16 18 18 18 10
[36] 12 15 19 20 22 20 19 19 19
a table like:
frequency cumulative relative
(9.99,11.7] 2 2 0.04545455
(11.7,13.4] 2 4 0.04545455
(13.4,15.1] 1 5 0.02272727
(15.1,16.9] 10 15 0.22727273
(16.9,18.6] 22 37 0.50000000
(18.6,20.3] 6 43 0.13636364
(20.3,22] 1 44 0.02272727
I know it should be simple, but I don't know how.
I got some results using this code:
factorx <- factor(cut(x, breaks=nclass.Sturges(x)))
as.matrix(table(factorx))
You're close! There are a few functions that will make this easy for you, namely cumsum() and prop.table(). Here's how I'd probably put this together. I make some random data, but the point is the same:
#Fake data
x <- sample(10:20, 44, TRUE)
#Your code
factorx <- factor(cut(x, breaks=nclass.Sturges(x)))
#Tabulate and turn into data.frame
xout <- as.data.frame(table(factorx))
#Add cumFreq and proportions
xout <- transform(xout, cumFreq = cumsum(Freq), relative = prop.table(Freq))
#-----
factorx Freq cumFreq relative
1 (9.99,11.4] 11 11 0.25000000
2 (11.4,12.9] 3 14 0.06818182
3 (12.9,14.3] 11 25 0.25000000
4 (14.3,15.7] 2 27 0.04545455
5 (15.7,17.1] 6 33 0.13636364
6 (17.1,18.6] 3 36 0.06818182
7 (18.6,20] 8 44 0.18181818
The base functions table, cumsum and prop.table should get you there:
cbind( Freq=table(x), Cumul=cumsum(table(x)), relative=prop.table(table(x)))
Freq Cumul relative
10 2 2 0.04545455
12 2 4 0.04545455
15 1 5 0.02272727
16 10 15 0.22727273
17 16 31 0.36363636
18 6 37 0.13636364
19 4 41 0.09090909
20 2 43 0.04545455
22 1 44 0.02272727
With cbind and naming of the columns to your liking, this should be pretty easy for you in the future. The output from the table function is a matrix, so this result is also a matrix. If this were being done on something big, it would be more efficient to do this:
tbl <- table(x)
cbind( Freq=tbl, Cumul=cumsum(tbl), relative=prop.table(tbl))
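The same pattern reproduces the binned interval table from the question when applied to factorx instead of the raw values:
tblx <- table(factorx)
cbind(Freq = tblx, Cumul = cumsum(tblx), relative = prop.table(tblx))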
If you are looking for something pre-packaged, consider the freq() function from the descr package.
library(descr)
x = c(sample(10:20, 44, TRUE))
freq(x, plot = FALSE)
Or to get cumulative percents, use the ordered() function
freq(ordered(x), plot = FALSE)
To add a "cumulative frequencies" column:
tab = as.data.frame(freq(ordered(x), plot = FALSE))
CumFreq = cumsum(tab[-dim(tab)[1],]$Frequency)
tab$CumFreq = c(CumFreq, NA)
tab
If your data has missing values, a valid percent column is added to the table.
x = c(sample(10:20, 44, TRUE), NA, NA)
freq(ordered(x), plot = FALSE)
Yet another possibility:
library(SciencesPo)
x = c(sample(10:20, 50, TRUE))
freq(x)
My suggestion is to check out the agricolae package:
library(agricolae)
weight <- c(68, 53, 69.5, 55, 71, 63, 76.5, 65.5, 69, 75, 76, 57, 70.5,
            71.5, 56, 81.5, 69, 59, 67.5, 61, 68, 59.5, 56.5, 73,
            61, 72.5, 71.5, 59.5, 74.5, 63)
h1<- graph.freq(weight,col="yellow",frequency=1,las=2,xlab="h1")
print(summary(h1),row.names=FALSE)
