How to generate a sequence based on two columns in R? - r

Below you can recreate my data in R. I would like to generate a sequence of numbers based on two individual columns. In this example of real data my column names are :
df= or10x1BC
"Tank" "Core" "BCl" "BCu" "Mid" "TL" "SL"
I wish to use the value in each row from BCu and BCl to generate a sequence by 0.001. For example seq(BCu[1], BCl[1], 0.001) will generate a sequence based on the first row in each, I wish to have this work for each row down the list.
Ultimately this sequence will be passed to my function and averaged, i.e. mean(function(seq(BCu[i], BCl[i], 0.001))), and the result will be added as a new column: or10x1BC["meanBVF"] = mean(function(seq(BCu[i], BCl[i], 0.001))).
See data below:
structure(list(Tank = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "1", class = "factor"), Core = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
BCl = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("17",
"18", "22", "22.3", "23", "26", "27.3", "28", "29"), class = "factor"),
BCu = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("12.5",
"13.5", "17", "17.8", "18", "22", "22.3", "23", "27.3"), class = "factor"),
Mid = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("14.75",
"15.75", "19.5", "20.05", "20.5", "24", "24.8", "25.5", "28.15"
), class = "factor"), TL = structure(c(2L, 2L, 2L, 1L, 1L,
1L, 3L, 3L, 3L), .Label = c("26", "28", "29"), class = "factor"),
SL = structure(c(4L, 4L, 3L, 2L, 4L, 3L, 1L, 4L, 3L), .Label = c("1.7",
"4", "4.5", "5"), class = "factor")), .Names = c("Tank",
"Core", "BCl", "BCu", "Mid", "TL", "SL"), row.names = c(NA, -9L
), class = "data.frame")

mapply is like apply, or lapply, but with multiple arguments:
First, as I mentioned in the comment, we need to convert your data to numeric. I did it like this, to convert all but the second column:
# Convert every column except the second from factor to numeric in one pass.
# Going through character keeps the printed values instead of the factor's
# internal integer codes.
df[, -2] <- lapply(df[, -2], function(col) as.numeric(as.character(col)))
We can then use mapply like this to generate the sequences:
# Build one sequence per row, running from BCu to BCl in steps of 0.001.
# SIMPLIFY = FALSE guarantees a list of vectors: by default mapply() collapses
# the result into a matrix whenever every sequence happens to have the same
# length, which would break code that expects a list.
seqs <- mapply(
  FUN = function(a, b) seq(from = a, to = b, by = 0.001),
  a = df$BCu,
  b = df$BCl,
  SIMPLIFY = FALSE
)
It seems messy to put that in the data frame, but you can if you'd like:
df$seqs = seqs
If it were me, I'd probably leave it as a list of vectors outside of the data frame.

Related

How can I use R to fill rows based on column?

I have the following table
Code Name Class
1
2 Monday day
5 green color
9
6
1 red color
1
2
9 Tuesday day
6
5
The goal is to fill the Name and Class columns based on the Code column of a filled row. For example, the second row is filled and its code is 2. I would like to fill all the rows where Code = 2 with Name = Monday and Class = day.
I tried using fill() from tidyr, but that seems to require ordered data.
structure(list(Code = c(1L, 2L, 5L, 9L, 6L, 1L, 1L, 2L, 9L, 6L,
5L), Name = structure(c(1L, 3L, 2L, 1L, 1L, 4L, 1L, 1L, 5L, 1L,
1L), .Label = c("", "green", "Monday", "red", "Tuesday"), class = "factor"),
Class = structure(c(1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L, 3L, 1L,
1L), .Label = c("", "color", "day"), class = "factor")), .Names = c("Code",
"Name", "Class"), class = "data.frame", row.names = c(NA, -11L
))
library(dplyr)
# Rows that already have a Name serve as the lookup table for each Code.
# After the join the looked-up Name/Class sit in columns 4 and 5, so keep
# those next to Code and restore the original column names.
final_df <- left_join(df, df[df$Name != "", ], by = "Code") %>%
  select(1, 4:5) %>%
  setNames(names(df))
final_df

How to convert all column data type to numeric and character dynamically?

I convert my columns data type manually:
data[,'particles'] <- as.numeric(as.character(data[,'particles']))
This is not ideal, as the data may evolve and I can't be sure which species will be coming; for instance, they could be "nox", "no2", "co", "so2", "pm10", and more in the future.
Is there any way to convert them automatically?
My current dataset:
structure(list(particles = structure(c(1L, 3L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 5L, 6L, 5L, 3L), .Label = c("1", "11", "1.1",
"2", "2.1", "3.1"), class = "factor"), humidity = structure(c(4L,
7L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 6L, 1L, 1L, 1L,
5L, NA, NA, NA, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0.1",
"1", "1.1", "1.3", "21", "2.1", "3"), class = "factor"), timestamp = c(1468833354929,
1468833365186, 1468833378458, 1468833538213, 1468833538416, 1468833538613,
1468833538810, 1468833538986, 1468833539172, 1468833539358, 1468833539539,
1468833554592, 1468833559059, 1468833562357, 1468833566225, 1468833573486,
1468840019118, 1468840024950, 1469029568849, 1469029584243, 1469029590530,
1469029622391, 1469029623598, 1469245154003, 1469245156533, 1469245156815,
1469245157123, 1469245162358, 1469245165911, 1469245170178, 1469245173788
), date = structure(c(1468833354.929, 1468833365.186, 1468833378.458,
1468833538.213, 1468833538.416, 1468833538.613, 1468833538.81,
1468833538.986, 1468833539.172, 1468833539.358, 1468833539.539,
1468833554.592, 1468833559.059, 1468833562.357, 1468833566.225,
1468833573.486, 1468840019.118, 1468840024.95, 1469029568.849,
1469029584.243, 1469029590.53, 1469029622.391, 1469029623.598,
1469245154.003, 1469245156.533, 1469245156.815, 1469245157.123,
1469245162.358, 1469245165.911, 1469245170.178, 1469245173.788
), class = c("POSIXct", "POSIXt"), tzone = "Asia/Singapore")), .Names = c("particles",
"humidity", "timestamp", "date"), row.names = c(NA, -31L), class = "data.frame")
It has particles, humidity, timestamp, date.
Another option using mutate_if() from dplyr which allows you to operate on columns for which a predicate returns TRUE
library(dplyr)
# Convert every factor column to numeric via character, so the printed values
# are kept rather than the internal level codes.
# An ordinary function is passed because funs() is deprecated in dplyr.
df %>%
  mutate_if(is.factor, function(x) as.numeric(as.character(x)))
Note: This method will work for your follow up question as well
If you don't know which columns need to be converted beforehand, you can extract that info from your dataframe as follows:
vec <- sapply(dat, is.factor)
which gives:
> vec
particles humidity timestamp date
TRUE TRUE FALSE FALSE
You can then use this vector to do the conversion on the subset with lapply:
# notation option one:
# drop = FALSE keeps a data frame even when only one column is selected;
# without it the selection collapses to a vector and lapply() would loop over
# its individual elements instead of over columns.
dat[, vec] <- lapply(dat[, vec, drop = FALSE], function(x) as.numeric(as.character(x)))
# notation option two (list-style indexing never drops, so it is always safe):
dat[vec] <- lapply(dat[vec], function(x) as.numeric(as.character(x)))
If you want to detect both factor and character columns, you can use:
sapply(dat, function(x) is.factor(x)|is.character(x))
We can use data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) if(is.factor(x)) as.numeric(as.character(x)) else x)]
The best option is I think apply
You can do
newD<-apply(data[,"names"], 2,function(x) as.numeric(as.character(x)))
where in "names" you put all the variables you want. Then apply with 2 as second argument will apply the function(x) on all the columns(if you put 1 its by rows) of the first argument. And you can save it as new dataset or rewrite the old one with
data[,"names"]<-apply....
Use lapply:
cols <- c("particles", "nox", ...)
data[,cols] <- lapply(data[,cols], function(x) as.numeric(as.character(x)))

TukeyHSD specific conditions

I'm trying to get a TukeyHSD to run in R; my code looks like this:
# ---- RING data ----
library(doBy)

# Working directory (path left blank in the post)
setwd("")

# ---- Read the data and convert the grouping columns to factors ----
dat <- read.table("afstand.txt", header = TRUE)
str(dat)
factor_cols <- c("Vial", "Line", "Fly", "Temp")
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)
str(dat)

# Sum every time-point column within each combination of grouping variables
datSUM <- summaryBy(
  X0.5_sec + X1_sec + X1.5_sec + X2_sec + X2.5_sec + X3_sec ~
    Vial_nr + Concentration + Sex + Line + Vial + Temp,
  data = dat,
  FUN = sum
)
fl <- levels(datSUM$Line)

# Full factorial ANOVA on the 0.5 s sums
aov1 <- aov(X0.5_sec.sum ~ Concentration * Sex * Line * Temp, data = datSUM)
summary(aov1) # overview of the model

# Pairwise comparisons for the Line factor only
TukeyHSD(aov1, which = "Line", ordered = TRUE, conf.level = 0.95)
What I would like to do is look at the interaction between Line and Temp, for instance, but if I run TukeyHSD(aov1) then I get ALL the interactions, and the output is truncated with this message: [ reached getOption("max.print") -- omitted 3716 rows ]. Is there a way to specify that I want to test only between Line and Temp rather than all combinations, or a way of showing only the significant results if I just run TukeyHSD(aov1)?
I have tried using TukeyHSD(aov1, 'Line,Temp',ordered = TRUE, conf.level = 0.95) , TukeyHSD(aov1, 'Line':'Temp',ordered = TRUE, conf.level = 0.95) and TukeyHSD(aov1, 'Line'&'Temp',ordered = TRUE, conf.level = 0.95) but with no luck.
structure(list(Concentration = structure(c(2L, 7L, 7L, 1L, 7L,
1L, 2L, 1L, 7L, 1L, 4L, 2L, 2L, 1L, 2L, 4L, 7L, 2L, 2L, 1L), .Label = c("a",
"b", "c", "d", "e", "x", "y"), class = "factor"), Sex = structure(c(1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L), .Label = c("f", "m"), class = "factor"), Line = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L,
3L, 3L, 3L), .Label = c("20", "23", "40", "73"), class = "factor"),
Temp = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L), .Label = c("23",
"29"), class = "factor"), X0.5_sec.sum = c(107.19, 46.17,
58.05, 75.87, 87.75, 71.55, 46.17, 47.25, 22.41, 31.05, 27.36,
79.11, 87.58, 21.33, 34.29, 60.4, 85.05, 72.47, 114.21, 67.77
)), .Names = c("Concentration", "Sex", "Line", "Temp", "X0.5_sec.sum"
), row.names = c(NA, 20L), class = "data.frame")
To show only the interactions between the variables Line and Temp, you can specify the argument which as follows:
which = 'Line:Temp'
which then turns your complete function call to TukeyHSD into:
TukeyHSD(aov1, 'Line:Temp', ordered = TRUE, conf.level = 0.95)

R - Convert List of Lists into single dataframe

So, I have created a list (and a single column matrix) that contains 256 nested lists. What I would like to do is convert each of the 256 lists into a single dataframe of 16 columns and then write.table it. Although each list contains the same number of columns (16), the number of rows for each list varies. I have tried to use unlist, unsuccessfully, because of the changing row counts. I can subset each list individually, so I know there's an easier way to do the whole list.
I'm pretty new to R, so I apologize for asking what may be a naive novice question. I searched through a lot of topics the last couple days and didn't see anything that seemed to match my problem. for loop seems like it might be unnecessary and I wasn't sure if lapply was the correct route, either.
UPDATE: dput of first list:
list(structure(list(structure(c(2L, 11L, 15L, 8L, 7L, 3L, 6L, 10L,
1L, 1L, 18L, 13L, 14L, 19L, 16L, 17L, 4L, 5L, 9L, 12L), .Label = c("",
"Aaron Rodgers", "Andrew Quarless", "Derrick Coleman", "Doug Baldwin",
"DuJuan Harris", "Eddie Lacy", "James Starks", "Jermaine Kearse",
"John Kuhn", "Jordy Nelson", "Luke Willson", "Marshawn Lynch", "Percy
Harvin", "Randall Cobb", "Ricardo Lockette", "Robert Turbin",
"Russell Wilson", "Zach Miller"), class = "factor"), Tm =
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("GNB", "Passing", "SEA", "Tm"),
class = "factor"), Cmp = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "19",
"23", "Cmp", "Rushing"), class = "factor"), Att = structure(c(3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "28", "33", "Att", "Receiving"
), class = "factor"), Yds = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, NA, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "189", "191", "Yds"), class = "factor"),
TD = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "1",
"2", "TD"), class = "factor"), Int = structure(c(3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, NA, 4L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "0", "1", "Int"), class = "factor"),
Lng = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "23",
"33", "Lng"), class = "factor"), Att = structure(c(1L, 1L,
1L, 7L, 3L, 1L, 2L, 2L, NA, 8L, 7L, 4L, 5L, 1L, 1L, 6L, 1L,
1L, 1L, 1L), .Label = c("", "1", "12", "20", "4", "6", "7",
"Att"), class = "factor"), Yds = structure(c(1L, 1L, 1L,
7L, 6L, 1L, 9L, 3L, NA, 10L, 5L, 2L, 8L, 1L, 1L, 4L, 1L,
1L, 1L, 1L), .Label = c("", "110", "2", "27", "29", "34",
"37", "41", "7", "Yds"), class = "factor"), TD = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 2L, 3L, NA, 5L, 2L, 4L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("", "0", "1", "2", "TD"), class = "factor"),
Lng = structure(c(1L, 1L, 1L, 2L, 4L, 1L, 8L, 6L, NA, 9L,
3L, 7L, 5L, 1L, 1L, 8L, 1L, 1L, 1L, 1L), .Label = c("", "12",
"13", "15", "16", "2", "21", "7", "Lng"), class = "factor"),
Rec = structure(c(1L, 7L, 5L, 3L, 4L, 4L, 1L, 1L, NA, 8L,
1L, 2L, 6L, 4L, 3L, 1L, 2L, 4L, 2L, 2L), .Label = c("", "1",
"2", "3", "6", "7", "9", "Rec"), class = "factor"), Yds = structure(c(1L,
12L, 9L, 3L, 3L, 6L, 1L, 1L, NA, 13L, 1L, 4L, 10L, 8L, 7L,
1L, 5L, 4L, 11L, 2L), .Label = c("", "1", "11", "14", "15",
"26", "38", "42", "58", "59", "8", "83", "Yds"), class = "factor"),
TD = structure(c(1L, 2L, 3L, 2L, 2L, 2L, 1L, 1L, NA, 4L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L), .Label = c("", "0",
"1", "TD"), class = "factor"), Lng = structure(c(1L, 7L,
9L, 3L, 4L, 8L, 1L, 1L, NA, 14L, 1L, 5L, 11L, 10L, 11L, 1L,
6L, 12L, 13L, 2L), .Label = c("", "1", "11", "12", "14",
"15", "16", "18", "23", "24", "33", "6", "8", "Lng"), class = "factor")), .Names = c("", "Tm", "Cmp", "Att", "Yds", "TD", "Int",
"Lng", "Att", "Yds", "TD", "Lng", "Rec", "Yds", "TD", "Lng"),
row.names = c(NA, -20L ), class = "data.frame"))
So, each observation in my list is like this above and I want to convert all of the lists into their 16 column(Now that I think about it, it's 17 columns, one is just unnamed) dataframe layout and stack all the rows together in one place that I can then write.table
Let's call your list l where l[[1]] is what you have dput above.
Two easy ways from base R and from data.table
do.call("rbind", l)
data.table::rbindlist(l)
This assumes that the columns match in each list element. Your example doesn't confirm this, although you state it.

Select (multiple) integers with n occurrences per row

I have a data.frame where the data entries are entered in this format: 1,2,3,10. That is, they are comma-separated integers that range from 0-20 and do not need to be consecutive. Each is currently considered a factor. I have four variables that contain these values, and I'd like to create a new variable that includes a given integer only if it occurs in at least three of the four variables; if no integer occurs three times, then use 0.
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
I am unsure on how to deal with these comma separated integers. If they were single integers, I could do something like this:
# modified from https://stackoverflow.com/a/14114085/1670053
# For each row of df, return the value(s) with the highest frequency in that
# row, provided that frequency is at least 3; otherwise return 0.
sapply(seq_len(nrow(df)), function(idx) {
  # number of times each entry occurs in this row
  counts <- table(t(df[idx, ]))
  # no entry occurs three or more times
  if (max(counts) <= 2) {
    return(0)
  }
  # all values that reach the maximum count
  as.numeric(names(counts[counts == max(counts)]))
})
Though with these multiple values separated by commas, I am unsure where to start.
# data and example output
df <- structure(list(M1 = structure(c(3L, 2L, 2L, 5L, 3L, 1L, 7L, 1L,
8L, 1L, 3L, 4L, 3L, 6L), .Label = c("0", "1", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "3,4,5,6,7", "6,7,8,9,10,11,12,13,14,15,16"
), class = "factor"), M2 = structure(c(5L, 2L, 2L, 4L, 4L, 1L,
11L, 8L, 7L, 9L, 3L, 6L, 3L, 10L), .Label = c("0", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
"2", "2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"), class = "factor"),
M3 = structure(c(4L, 1L, 1L, 8L, 3L, 1L, 6L, 1L, 7L, 3L,
2L, 5L, 9L, 3L), .Label = c("0", "1,2", "1,2,3,4", "1,2,3,4,5",
"1,2,3,4,5,6", "1,2,3,4,5,6,7,8", "1,2,3,4,5,6,7,8,9,10,11,12,13,14",
"3,4", "3,4,5"), class = "factor"), M4 = structure(c(5L,
1L, 2L, 8L, 2L, 1L, 6L, 3L, 4L, 1L, 3L, 3L, 7L, 9L), .Label = c("0",
"1", "1,2", "1,2,3,4,5,12,13,14,15,16,17", "1,2,3,4,5,6",
"1,2,3,4,5,6,7,8,9,10,11,12", "3,4", "4", "4,5"), class = "factor"),
M_NEW = structure(c(6L, 1L, 2L, 8L, 3L, 1L, 9L, 1L, 7L, 1L,
3L, 4L, 5L, 10L), .Label = c("0", "1", "1,2", "1,2,3", "1,2,3,",
"1,2,3,4,5", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16", "3,4",
"3,4,5,6,7,8", "4,5"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW"), class = "data.frame", row.names = c(NA,
-14L))

# Collapse one row of comma-separated integer strings into the integers that
# occur at least `n` times across the cells of that row.
#   x: character vector holding one row, e.g. c("1,2", "1", "0", "1")
#   n: minimum number of occurrences required (default 3)
# Returns a single comma-separated string ordered by numeric value, or "0"
# when no integer reaches the threshold.
f <- function(x, n = 3) {
  tokens <- strsplit(paste(x, collapse = ","), ",")[[1]]
  # ignore empty pieces left behind by blank cells or stray commas
  tokens <- tokens[nzchar(tokens)]
  tab <- table(tokens)
  hits <- names(tab)[as.vector(tab) >= n]
  if (length(hits) == 0L) {
    return("0")
  }
  # table() orders its names as strings ("10" before "2"); reorder by value
  paste(hits[order(as.numeric(hits))], collapse = ",")
}
(df[, 5] <- apply(df[, 1:4], 1, f))
# [1] "1,2,3,4,5"
# [2] "0"
# [3] "1"
# [4] "3,4"
# [5] "1,2"
# [6] "0"
# [7] "3,4,5,6,7,8"
# [8] "0"
# [9] "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"
# [10] "0"
# [11] "1,2"
# [12] "1,2,3"
# [13] "3"
# [14] "4,5"

Resources