Related
I have two dataframes. These are examples of my dataframes:
#dataframe 1
Department <- c("ABS", "ABS", "ABS", "POL", "DOT")
Division <- c("BDO", "BL", "BL", "FSB", "DMS")
Gender <- c("M", "M", "M", "F", "M")
Grade <- c("15", "12","12", "16", "M2")
Salary20 <- c("47", "43", "41", "71", "16")
Overtime20 <- c("17", "43", "63", "0", "0")
df1 <- data.frame(Department, Division, Gender, Grade, Salary20, Overtime20)
df1
#dataframe 2
Department <- c("ABS", "ABS", "ABS", "POL", "HHS")
Division <- c("BDO", "BL", "BL", "FSB", "BHC")
Gender <- c("M", "M", "M", "F","F")
Grade <- c("15", "12","12", "16", "24")
Salary21 <- c("49", "45", "43", "72", "100")
Overtime21 <- c("35", "11", "10", "0", "40")
df2 <- data.frame(Department, Division, Gender, Grade, Salary21, Overtime21)
df2
I'll like to merge or left join df2$Salary21 & df2$Overtime21 to df1 based on Divison + Gender + Grade just like the dataframe below:
#Final dataframe
Department <- c("ABS", "ABS", "ABS", "POL", "DOT", "HHS")
Division <- c("BDO", "BL", "BL", "FSB", "DMS", "BHC")
Gender <- c("M", "M", "M", "F", "F", "F")
Grade <- c("15", "12","12", "16", "M2", "24")
Salary20 <- c("47", "43", "41", "71", "16", "0")
Overtime20 <- c("17", "43", "63", "0", "0", "0")
Salary21 <- c("49", "45", "43", "72", "0", "100")
Overtime21 <- c("35", "11", "10", "0", "0", "40")
df <- data.frame(Department, Division, Gender, Grade, Salary20, Overtime20, Salary21, Overtime21)
df
The problem is that there is no unique identifier (UID) and some rows are identical in Divison + Gender + Grade (example: ABS rows).
If this possible to merge? If so how?
I do not know of any way to accurately combine these dataframes without a unique identifier - as you mentioned, the identical rows mean that Division, Gender, and Grade are not enough.
I would try to add my own UID. The ease of this depends on how the data is formatted of course. I've included an example below to show how this then makes using merge or join easy.
#dataframe 1
Department <- c("ABS_1", "ABS_2", "ABS_3", "POL_1", "DOT_1")
Division <- c("BDO", "BL", "BL", "FSB", "DMS")
Gender <- c("M", "M", "M", "F", "M")
Grade <- c("15", "12","12", "16", "M2")
Salary20 <- c("47", "43", "41", "71", "16")
Overtime20 <- c("17", "43", "63", "0", "0")
df1 <- data.frame(Department, Division, Gender, Grade, Salary20, Overtime20)
df1
#dataframe 2
Department <- c("ABS_1", "ABS_2", "ABS_3", "POL_1", "HHS_1")
Division <- c("BDO", "BL", "BL", "FSB", "BHC")
Gender <- c("M", "M", "M", "F","F")
Grade <- c("15", "12","12", "16", "24")
Salary21 <- c("49", "45", "43", "72", "100")
Overtime21 <- c("35", "11", "10", "0", "40")
df2 <- data.frame(Department, Division, Gender, Grade, Salary21, Overtime21)
df2
# combine
df <- merge(df1,df2, by = c("Department","Division","Gender","Grade"), all = TRUE)
I have a large dataset and I want to subtract specific columns from each other based on their position. I want to subtract column 2 from column 8, column 3 from column 9 and column 4 from column 10.
Thanks a lot
Magnus
structure(list(Stamp_summertime = structure(c(1546684744, 1546685858,
1546687004, 1547030061, 1547030835, 1547031816), tzone = "UTC", class = c("POSIXct",
"POSIXt")), X26.013 = c(0.138461, 0.138461, 0.138461, 0.144421,
0.144421, 0.144421), X27.024 = c(0.0752111, 0.0752111, 0.0752111,
0.0426819, 0.0426819, 0.0426819), X33.031 = c(3.75788, 3.75788,
3.75788, 3.12581, 3.12581, 3.12581), jar_camp = c("1_pf1.1",
"2_pf1.1", "3_pf1.1", "1_pf2.1", "2_pf2.1", "3_pf2.1"), jar = structure(c(1L,
12L, 23L, 1L, 12L, 23L), .Label = c("1", "10_blank", "11", "12",
"13", "14", "15", "16_blank", "17", "18", "19", "2", "20_blank",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "3", "30_blank",
"31", "32", "33", "34", "35", "36", "37", "38_blank", "39", "4",
"40", "41", "42", "43", "44_blank", "45", "46", "47", "48", "49",
"5_blank", "blank_50", "51", "52", "53", "54", "55", "56", "57",
"6", "7", "8", "9", "X_blank"), class = "factor"), campaign = c("pf1.1",
"pf1.1", "pf1.1", "pf2.1", "pf2.1", "pf2.1"), i.X26.013 = c(0.144658,
0.21502, 0.458296, 0.191571, 0.0789067, 0.711814), i.X27.024 = c(0.0595547,
0.0651149, 0.146772, 0.0997815, 0.0539976, 0.185398), i.X33.031 = c(5.4066,
3.30406, 18.0479, 6.13854, 1.3028, 22.2226)), sorted = "Stamp_summertime", class = c("data.table",
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x00000237a3d91ef0>)
We can create 2 vectors of position and subtract the columns directly. Since you have data.table we use ..column_number to select columns by position.
library(data.table)
col1group <- 2:4
col2group <- 8:10
df[, ..col1group] - df[, ..col2group])
If you want to add them as new columns to original data you can rename them and cbind
cbind(df, setNames(df[, ..col1group] - df[, ..col2group],
paste0(names(df)[col1group], '_diff')))
Something like the following computes the subtractions in the question.
library(data.table)
nms <- names(df1)
iCols <- grep("^i\\.", nms, value = TRUE)
Cols <- sub("^i\\.", "", iCols)
df1[, lapply(seq_along(Cols), function(i) get(Cols[i]) - get(iCols[i]))]
# V1 V2 V3
#1: -0.0061970 0.0156564 -1.64872
#2: -0.0765590 0.0100962 0.45382
#3: -0.3198350 -0.0715609 -14.29002
#4: -0.0471500 -0.0570996 -3.01273
#5: 0.0655143 -0.0113157 1.82301
#6: -0.5673930 -0.1427161 -19.09679
Following Ronak Shah's answer I realized that the code below also works.
df1[, ..Cols] - df1[, ..iCols]
The numeric results are the same but the column names are the vector Cols.
To create new columns, try
newCols <- paste(Cols, "diff", sep = "_")
df1[, (newCols) := lapply(seq_along(Cols), function(i) get(Cols[i]) - get(iCols[i]))]
Base R solution:
idx <- c(2, 3, 4)
jdx <- c(8, 9, 10)
Using lapply() and column binding the list:
setNames(do.call("cbind", lapply(seq_along(idx), function(i){
df[, jdx[i], drop = FALSE] - df[, idx[i], drop = FALSE]
}
)
), c(paste("x", jdx, idx, sep = "_")))
Using sapply() and coercing vectors to a data.frame:
setNames(data.frame(sapply(seq_along(idx), function(i){
df[, jdx[i], drop = FALSE] - df[, idx[i], drop = FALSE]
}
)
), c(paste("x", jdx, idx, sep = "_")))
Using Map() and Reduce() and column binding to original data.frame:
cbind(df, setNames(Reduce(cbind, Map(function(i){
df[, jdx[i], drop = FALSE] - df[, idx[i], drop = FALSE]
}, seq_along(idx))), c(paste("x", jdx, idx, sep = "_"))))
I have done species accumulation curves and would like to plot the SAC results of different substrate sizeclasses in the same ggplot, with expected species richness on y-axis and number of sites samples on x-axis. The data features a cumulative number of samples in each sizeclass (column "sites"), the expected species richness (column "richness"), and substrate size classes 10, 20 and 30 (column "sc").
sites richness sc
1 1 0.6696915 10
2 2 1.2008513 10
3 3 1.6387310 10
4 4 2.0128472 10
5 5 2.3424933 10
6 6 2.6403239 10
sites richness sc
2836 1 1.000000 20
2837 2 1.703442 20
2838 3 2.249188 20
2839 4 2.706618 20
2840 5 3.110651 20
2841 6 3.479173 20
I want each sizeclass to have unique linetype. I used the following code for ggplot:
sac_kaikki<-ggplot(sac_data, aes(x=sites, y=richness,group=sc)) +
geom_line(aes(linetype=sc))+
coord_cartesian(xlim=c(0,100))+
theme(axis.title.y = element_blank())+
theme(axis.title.x = element_blank())
However, instead of getting three neat lines in different linetypes, I got [this jumbly muddly messy thing with more stripes than a herd of zebras][1]: https://i.stack.imgur.com/iD75K.jpg. I am sure the solution is rather simple, but for my life I am not able to figure it out.
// as Brookes kindly pointed out I should add some reproducible data, here is a subset of my data with dput, featuring 10 first observations of size classes 10 and 20:
dput(head(subset(sac_data,sac_data$sc=="10"),10))
structure(list(sites = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), richness = c(0.669691470054462,
1.20085134466255, 1.63873100707468, 2.01284716414471, 2.34249332096243,
2.64032389106845, 2.91468283244696, 3.17111526890278, 3.41334794519086,
3.64392468817362), sc = c("10", "10", "10", "10", "10", "10",
"10", "10", "10", "10")), .Names = c("sites", "richness", "sc"
), row.names = c(NA, 10L), class = "data.frame")
dput(head(subset(sac_data,sac_data$sc=="20"),10))
structure(list(sites = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), richness = c(0.999999999999987,
1.70344155844158, 2.24918831168832, 2.70661814764865, 3.11065087175364,
3.47917264517669, 3.82165739030286, 4.14341144680334, 4.44765475554031,
4.73653870494466), sc = c("20", "20", "20", "20", "20", "20",
"20", "20", "20", "20")), .Names = c("sites", "richness", "sc"
), row.names = 2836:2845, class = "data.frame")
// okay so for whatever reason, the plot works just fine if I plot only two sizeclasses, but including the third one produces the absurd plot I posted a picture of.
structure(list(sites = 1:10, richness = c(0.42857142857143, 0.838095238095238,
1.22932330827066, 1.60300751879699, 1.95989974937343, 2.30075187969924,
2.62631578947368, 2.93734335839598, 3.23458646616541, 3.5187969924812
), sc = c("30", "30", "30", "30", "30", "30", "30", "30", "30",
"30")), .Names = c("sites", "richness", "sc"), row.names = c(NA,
10L), class = "data.frame")
Works fine for me with your sample data:
a <- structure(list(sites = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), richness = c(0.669691470054462,
1.20085134466255, 1.63873100707468, 2.01284716414471, 2.34249332096243,
2.64032389106845, 2.91468283244696, 3.17111526890278, 3.41334794519086,
3.64392468817362), sc = c("10", "10", "10", "10", "10", "10",
"10", "10", "10", "10")), .Names = c("sites", "richness", "sc"
), row.names = c(NA, 10L), class = "data.frame")
b <- structure(list(sites = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), richness = c(0.999999999999987,
1.70344155844158, 2.24918831168832, 2.70661814764865, 3.11065087175364,
3.47917264517669, 3.82165739030286, 4.14341144680334, 4.44765475554031,
4.73653870494466), sc = c("20", "20", "20", "20", "20", "20",
"20", "20", "20", "20")), .Names = c("sites", "richness", "sc"
), row.names = 2836:2845, class = "data.frame")
c <- structure(list(sites = 1:10, richness = c(0.42857142857143, 0.838095238095238,
1.22932330827066, 1.60300751879699, 1.95989974937343, 2.30075187969924,
2.62631578947368, 2.93734335839598, 3.23458646616541, 3.5187969924812
), sc = c("30", "30", "30", "30", "30", "30", "30", "30", "30",
"30")), .Names = c("sites", "richness", "sc"), row.names = c(NA,
10L), class = "data.frame")
sac_data <- bind_rows(a, b, c)
Plotting:
ggplot(sac_data, aes(sites, richness, group = sc)) +
geom_line(aes(linetype = sc))
I am new to R and would like to read in a list of files as separate data frames, perform several operations on each, and save them out as separate files with dynamic file names. I am thinking I should use lappy, but not sure.
Here is the code I wrote that works for one file:
df <- read.fwf('USC00011084.dly', widths = c(21, rep(c(5, 1, 1, 1),31)))
df2 <- df[-c(3:5, 7:9, 11:13, 15:17, 19:21, 23:25, 27:29, 31:33, 35:37, 39:41, 43:45, 47:49, 51:53, 55:57, 59:61, 63:65, 67:69, 71:73, 75:77, 79:81, 83:85, 87:89, 91:93, 95:97, 99:101, 103:105, 107:109, 111:113, 115:117, 119:121, 123:125)]
df2[df2=="-9999"]<-NA
df$new <- rowSums(df2[,2:32], na.rm = TRUE)
df2["Total"] <- df$new
colnames(df2) <- c("StationDateType", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "28", "30", "31", "TotalMonthly")
Prcp <- df2[grep("PRCP", df2$StationDateType),]
write.table(Prcp, "USC00011084Prcp.txt", sep="\t", row.names=FALSE)
How can I do this for a list of files in a directory? Any ideas? Thank you.
You can try this...
You can get a list of your files:
files <- list.files(getwd())
Write a function that performs the analysis you want and writes the results to table, as you have done. Here we use tools::file_path_sans_ext to extract the filename (without the file type extension), and at the end use it to name the table to be saved to txt.
myFunction <- function(files){
fileName <- tools::file_path_sans_ext(files)
df <- read.fwf(files, widths = c(21, rep(c(5, 1, 1, 1),31)))
# rest of your code
# ...
write.table(Prcp, paste0(fileName, "Prcp.txt"), sep="\t", row.names=FALSE)
}
You can use lapply to run your function on each file in files.
lapply(files, function(x) myFunction(x))
I have a dataframe called mydf. I also have a vector called myvec <- c("chr5:11", "chr3:112", "chr22:334"). What I want to do is select range (including 3 values above and 3 values below) of rows if any of the vector elements match the key in mydf and make a subset of mydf(result).
Since in the myvec we have chr5:11 matching with the key in mydf, we are selecting rows matching chr5:8 (three values below) to chr5:14 (three values above) in the result.
mydf<- structure(list(key = structure(c(5L, 2L, 7L, 8L, 4L, 1L, 6L,
3L, 11L, 10L, 9L), .Names = c("34", "35", "36", "37", "38", "39",
"40", "41", "42", "43", "44"), .Label = c("chr5:10", "chr5:11",
"chr5:1123", "chr5:118", "chr5:12", "chr5:123", "chr5:13", "chr5:14",
"chr5:19", "chr5:8", "chr5:9"), class = "factor"), variantId = structure(1:11, .Names = c("34",
"35", "36", "37", "38", "39", "40", "41", "42", "43", "44"), .Label = c("9920068",
"9920069", "9920070", "9920071", "9920072", "9920073", "9920074",
"9920075", "9920076", "9920077", "9920078"), class = "factor")), .Names = c("key",
"variantId"), row.names = c("34", "35", "36", "37", "38", "39",
"40", "41", "42", "43", "44"), class = "data.frame")
result
key variant
43 "chr5:8" "9920077"
42 "chr5:9" "9920076"
39 "chr5:10" "9920073"
35 "chr5:11" "9920069"
34 "chr5:12" "9920068"
36 "chr5:13" "9920070"
37 "chr5:14" "9920071"
How about the following (I use data.table but the base version is almost the same)
library(data.table)
mydf <- as.data.table(mydf) #(if mydf really is stored as a matrix currently)
myvec2 <- lapply(strsplit(gsub("chr", "", myvec), split=":"), as.integer)
mydf[unique(Reduce(c, sapply(myvec2, function(x){
which(key %in% paste0("chr", x[1], ":", seq((x2 <- x[2]) - 3L, x2 + 3L)))}
))), ]
(in base, replace as.data.table with as.data.frame,key with mydf$key, and replace the closing square bracket ] with ,])
Extra option for sorting
Actually, I think this option is better in general, since it stores your information in a more pliable way in the first place. This version's a bit heavier in the data.table parlance.
mydf <- as.data.table(mydf)
#Split your `key` variable into its pre- and post-colon components
# (of course using better names if those numbers mean something
# more specific to you)
mydf[ , c("chr", "sub") :=
.(as.integer(gsub("chr|:.*", "", key)),
as.integer(gsub(".*:", "", key)))]
Now, proceeding much as before with a slight tweak:
myvec2<-lapply(strsplit(gsub("chr","",myvec),split=":"),as.integer)
mydf[unique(Reduce(c, sapply(myvec2, function(x){
which(chr == x[1] & sub %in% seq((x2 <- x[2]) - 3L, x2 + 3L))}
)))][order(chr, sub)]
Outputs:
key variantId chr sub
1: chr5:8 9920077 5 8
2: chr5:9 9920076 5 9
3: chr5:10 9920073 5 10
4: chr5:11 9920069 5 11
5: chr5:12 9920068 5 12
6: chr5:13 9920070 5 13
7: chr5:14 9920071 5 14
You can use the GenomicRanges package.
library(GenomicRanges)
myvec <- c("chr5:11", "chr3:112", "chr22:334")
myvec.gr <- GRanges(gsub(":.+", "", myvec),
IRanges(as.numeric(gsub(".+:", "", myvec))-3,
as.numeric(gsub(".+:", "", myvec)))+3)
mydf.gr <- GRanges(gsub(":.+", "", mydf[,"key"]),
IRanges(as.numeric(gsub(".+:", "", mydf[,"key"])),
as.numeric(gsub(".+:", "", mydf[,"key"]))))
d.v.op <- findOverlaps(mydf.gr, myvec.gr)
mydf[queryHits(d.v.op), ]
# key variantId
# 34 "chr5:12" "9920068"
# 35 "chr5:11" "9920069"
# 36 "chr5:13" "9920070"
# 37 "chr5:14" "9920071"
# 39 "chr5:10" "9920073"
# 42 "chr5:9" "9920076"
# 43 "chr5:8" "9920077"