I have 2 files with say 3 columns and a few rows.
1 2 10
2 3 20
3 4 30
4 5 40
5 1 50
6 1 60
and
1 8 10
2 3 100
3 4 45
4 5 78
5 2 99
6 80 60
Now I want to create a third file containing all the rows of the first two files. If the first and second columns match in both files, the value from the third column of the first file should go in the third column of the new file, and the value from the third column of the second file should go in its fourth column; rows without a match get 0 in the fourth column.
According to above example answer should be
1 2 10 0
2 3 20 100
3 4 30 45
4 5 40 78
1 8 10 0
5 1 50 0
6 1 60 0
5 2 99 0
6 80 60 0
# Full outer join on the two key columns; the two V3 columns come back as
# V3.x (from dat1) and V3.y (from dat2) in positions 3 and 4.
res <- merge(dat1, dat2, by = c("V1", "V2"), all = TRUE)
# Rows with no value in column 3 came from dat2 only: move their column-4
# value into column 3 and clear column 4.
only_in_second <- is.na(res[[3]])
res[only_in_second, 3] <- res[only_in_second, 4]
res[only_in_second, 4] <- NA
# Every remaining gap (a key pair present in just one input) becomes 0.
res[is.na(res)] <- 0
# V1 V2 V3.x V3.y
#1 1 2 10 0
#2 1 8 10 0
#3 2 3 20 100
#4 3 4 30 45
#5 4 5 40 78
#6 5 1 50 0
#7 5 2 99 0
#8 6 1 60 0
#9 6 80 60 0
data
# Example data for the first file: a six-row data frame whose three columns
# (V1, V2, V3) are all stored as factors.
dat1 <- structure(list(V1 = structure(1:6, .Label = c("1", "2", "3",
"4", "5", "6"), class = "factor"), V2 = structure(c(2L, 3L, 4L,
5L, 1L, 1L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
V3 = structure(1:6, .Label = c("10", "20", "30", "40", "50",
"60"), class = "factor")), .Names = c("V1", "V2", "V3"), class = "data.frame", row.names = c(NA,
-6L))
# Example data for the second file: a six-row data frame whose three columns
# (V1, V2, V3) are all stored as factors.
dat2 <- structure(list(V1 = structure(1:6, .Label = c("1", "2", "3",
"4", "5", "6"), class = "factor"), V2 = structure(c(5L, 2L, 3L,
4L, 1L, 6L), .Label = c("2", "3", "4", "5", "8", "80"), class = "factor"),
V3 = structure(c(1L, 2L, 3L, 5L, 6L, 4L), .Label = c("10",
"100", "45", "60", "78", "99"), class = "factor")), .Names = c("V1",
"V2", "V3"), class = "data.frame", row.names = c(NA, -6L))
Convert the data columns to numeric class before you try the above code
# Turn every column into numeric. The detour through character matters:
# as.numeric() applied directly to a factor returns its integer codes, not the
# printed values.
dat1[] <- lapply(dat1, function(column) {
  as.numeric(as.character(column))
})
dat2[] <- lapply(dat2, function(column) {
  as.numeric(as.character(column))
})
It would be easier if you post an example with dput(). I would check if ?merge helps or rbind.fill (package plyr).
Hope this helps
Hermann
Related
I'd like to create a variable to count the number of unique values in each row for a subset of columns (i.e.,baseline,wave1,wave2,wave3). So far I have the below. I have included an example data set with a variable "example" to show what I am after. I also have included the variable "change", which shows the variable created using the code below.
# Create example data: ten rows of character columns (age, baseline, wave1,
# wave2, wave3), the desired result ("example") and the integer column produced
# by the attempt below ("change"). The object carries the rowwise_df class.
data <- structure(list(age = c("18", "19", NA, "40", "21", "33", "32",
"34", "43", "22"), baseline = c("1", "1", NA, "4", "1", "3",
"2", "4", "3", "2"), wave1 = c("1", "1", "2", "4", "4", "3",
"2", "4", "3", "2"), wave2 = c("1", "1", "4", "4", NA, "3",
"2", "4", "3", "2"), wave3 = c("1", "2", NA, "4", "4", "3",
"2", "4", "3", "4"), example = c("1", "2", "2", "1", "2", "1",
"1", "1", "1", "2"), change = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L)), row.names = c(NA, -10L), groups = structure(list(.rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
library(dplyr)
# Create a var for change at any point (ignoring NAs)
# NOTE(review): na.omit() takes a single object; here only `baseline` is that
# object and wave1-wave3 are passed through `...` -- presumably they are
# ignored, so the count would not cover all four columns. Confirm; combining
# them first, e.g. c(baseline, wave1, wave2, wave3), may be what was intended.
data <- data %>%
rowwise() %>% #perform operation by row
mutate(change = length(unique(na.omit(baseline,wave1,wave2,wave3))))
We can use n_distinct where we can use the na.rm argument to remove the NA elements (though in the OP's data, it was "NA")
library(dplyr)
# Parse the character columns into their natural types, then for each row count
# the distinct non-missing values across baseline..wave3. The row-wise grouping
# is dropped again at the end.
data %>%
  type.convert(as.is = TRUE) %>%
  rowwise() %>%
  mutate(change = n_distinct(c_across(baseline:wave3), na.rm = TRUE)) %>%
  ungroup()
-output
# A tibble: 10 × 7
age baseline wave1 wave2 wave3 example change
<int> <int> <int> <int> <int> <int> <int>
1 18 1 1 1 1 1 1
2 19 1 1 1 2 2 2
3 NA NA 2 4 NA 2 2
4 40 4 4 4 4 1 1
5 21 1 4 NA 4 2 2
6 33 3 3 3 3 1 1
7 32 2 2 2 2 1 1
8 34 4 4 4 4 1 1
9 43 3 3 3 3 1 1
10 22 2 2 2 4 2 2
Or a faster option with dapply from collapse
library(collapse)
# Select baseline..wave3 from the ungrouped data, then apply fndistinct to each
# row (MARGIN = 1) and store the per-row result in `change`.
wave_cols <- slt(ungroup(data), baseline:wave3)
data$change <- dapply(wave_cols, MARGIN = 1, FUN = fndistinct)
I know this is an easy question, but I'm really struggling and trying to be efficient with my code.
I have 3 different datasets:
head(Porto_2014) %>%
+ select(points_acc)
points_acc
1 3
2 6
3 9
4 10
5 11
6 12
head(Porto_2015) %>%
+ select(points_acc)
points_acc
1 3
2 4
3 7
4 10
5 13
6 14
head(Porto_2016) %>%
+ select(points_acc)
points_acc
1 3
2 6
3 6
4 9
5 10
6 13
I want to create a new dataframe points_by_season with 3 columns named Season_X, where X is the year of the season.
I should stress that I want to be super efficient with the number of lines of code.
Thank you in advance
You can use cbind
In case Porto_2014, Porto_2015 and Porto_2016 have the same number of rows:
# Put the per-season points side by side, one column per season.
# cbind() on plain vectors returns a matrix, so the result is wrapped in
# as.data.frame() to deliver the data frame that was asked for; the
# Season_* column names are preserved.
points_by_season <- as.data.frame(cbind(
  Season_2014 = Porto_2014$points_acc,
  Season_2015 = Porto_2015$points_acc,
  Season_2016 = Porto_2016$points_acc
))
If they do not have the same number of rows:
# Index every season with 1..(longest season). Indexing past the end of a
# vector yields NA, so shorter seasons are padded with NA instead of recycled.
tt <- seq_len(max(nrow(Porto_2014), nrow(Porto_2015), nrow(Porto_2016)))
# cbind() on plain vectors returns a matrix, so the result is wrapped in
# as.data.frame() to deliver the data frame that was asked for.
points_by_season <- as.data.frame(cbind(
  Season_2014 = Porto_2014$points_acc[tt],
  Season_2015 = Porto_2015$points_acc[tt],
  Season_2016 = Porto_2016$points_acc[tt]
))
One option is to load it into a list by getting the values of the objects (with mget), loop over the list (imap), select the column while renaming it replacing 'Porto' with 'Season' from the names of the list
library(dplyr)
library(purrr)
library(stringr)
# Collect the season data frames by name, then for each one keep only
# points_acc, renamed after the object it came from (Porto_YYYY becomes
# Season_YYYY), and bind the resulting columns together.
season_tables <- mget(str_c("Porto_", 2014:2016))
imap_dfc(season_tables, function(season_df, object_name) {
  season_df %>%
    select(!!str_replace(object_name, "Porto", "Season") := points_acc)
})
# Season_2014 Season_2015 Season_2016
#1 3 3 3
#2 6 4 6
#3 9 7 6
#4 10 10 9
#5 11 13 10
#6 12 14 13
Or in base R
# Fetch the three season data frames by name, keep only points_acc from each
# (still as one-column data frames), bind them column-wise and relabel the
# columns Season_2014 .. Season_2016.
season_years <- 2014:2016
season_points <- lapply(mget(paste0("Porto_", season_years)), `[`, "points_acc")
setNames(do.call(cbind, season_points), paste0("Season_", season_years))
data
# Example data: one data frame per season, each with a single integer column
# points_acc holding six values and character row names "1".."6".
Porto_2014 <- structure(
  list(points_acc = c(3L, 6L, 9L, 10L, 11L, 12L)),
  row.names = as.character(1:6),
  class = "data.frame"
)
Porto_2015 <- structure(
  list(points_acc = c(3L, 4L, 7L, 10L, 13L, 14L)),
  row.names = as.character(1:6),
  class = "data.frame"
)
Porto_2016 <- structure(
  list(points_acc = c(3L, 6L, 6L, 9L, 10L, 13L)),
  row.names = as.character(1:6),
  class = "data.frame"
)
I have two dataframes ,df_1, is:
symbol Sample_name
1 MTPAP sample_1
2 MTPAP sample_1
3 MTPAP sample_1
4 TENT2 sample_1
5 KIDINS220 sample_2
6 POLR1A sample_3
7 CCDC138 sample_4
8 CCDC74A sample_5
9 ATF2 sample_6
10 TLR9 sample_7
and df_2 is:
HGNC.ID symbol
1 HGNC:25532 MTPAP
2 HGNC:26776 TENT2
3 HGNC:16705 TENT4A
4 HGNC:30758 TENT4B
5 HGNC:26184 TUT1
6 HGNC:28981 TUT4
7 HGNC:25817 TUT7
8 HGNC:17264 POLR1A
9 HGNC:20454 POLR1B
10 HGNC:20194 POLR1C
I would like to make a matrix with column names matching the values present in "Sample_name" from df_1 and row names matching the "symbol" values from df_2. The value for each "symbol"/"Sample_name" pair should be "1" if the "symbol" value from df_2 is present as "symbol" in df_1 with that sample name, and "0" if the given "symbol"/"Sample_name" pair doesn't exist in df_1:
Sample_1 Sample_2 Sample_3
MTPAP 1 0 0
TENT2 1 0 0
TENT4A 0 0 0
TENT4B 0 0 0
TUT1 0 0 0
TUT4 0 0 0
TUT7 0 0 0
POLR1A 0 0 1
POLR1B 0 0 0
POLR1C 0 0 0
I created an empty matrix with:
# All-NA matrix with one row per distinct df_2 symbol and one column per
# distinct df_1 sample name, labelled accordingly.
gene_symbols <- unique(df_2$symbol)
sample_names <- unique(df_1$Sample_name)
c <- matrix(data = NA, nrow = length(gene_symbols), ncol = length(sample_names))
rownames(c) <- gene_symbols
colnames(c) <- sample_names
and I had a few attempts at filling it with the data from df_1 and df_2, but so far I have failed miserably…
Could anyone help me please?
Thanks in advance.
Use merge and then dcast from package reshape2.
# Left join on the shared symbol column: every df_2 symbol is kept, with NA as
# its Sample_name when df_1 has no matching row.
res <- merge(df_2[2], df_1, all.x = TRUE)
# Work on a character copy so the missing sample names can be replaced by "".
sample_chr <- as.character(res$Sample_name)
sample_chr[is.na(sample_chr)] <- ""
res$Sample_name <- sample_chr
# Long -> wide: one row per symbol, one column per sample name.
reshape2::dcast(res, symbol ~ Sample_name, value.var = "Sample_name")
# symbol Var.2 sample_1 sample_3
#1 MTPAP 0 3 0
#2 POLR1A 0 0 1
#3 POLR1B 1 0 0
#4 POLR1C 1 0 0
#5 TENT2 0 1 0
#6 TENT4A 1 0 0
#7 TENT4B 1 0 0
#8 TUT1 1 0 0
#9 TUT4 1 0 0
#10 TUT7 1 0 0
Data in dput format.
# df_1: ten symbol / Sample_name pairs, both columns stored as factors.
df_1 <-
structure(list(symbol = structure(c(5L, 5L, 5L,
7L, 4L, 6L, 2L, 3L, 1L, 8L), .Label = c("ATF2",
"CCDC138", "CCDC74A", "KIDINS220", "MTPAP",
"POLR1A", "TENT2", "TLR9"), class = "factor"),
Sample_name = structure(c(1L, 1L, 1L, 1L, 2L,
3L, 4L, 5L, 6L, 7L), .Label = c("sample_1", "sample_2",
"sample_3", "sample_4", "sample_5", "sample_6",
"sample_7"), class = "factor")),
class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))
# df_2: ten HGNC.ID / symbol pairs, both columns stored as factors.
df_2 <-
structure(list(HGNC.ID = structure(c(5L,
8L, 1L, 10L, 7L, 9L, 6L, 2L, 4L, 3L),
.Label = c("HGNC:16705", "HGNC:17264", "HGNC:20194",
"HGNC:20454", "HGNC:25532", "HGNC:25817",
"HGNC:26184", "HGNC:26776", "HGNC:28981",
"HGNC:30758"), class = "factor"),
symbol = structure(c(1L, 5L, 6L, 7L, 8L,
9L, 10L, 2L, 3L, 4L), .Label = c("MTPAP", "POLR1A",
"POLR1B", "POLR1C", "TENT2", "TENT4A", "TENT4B",
"TUT1", "TUT4", "TUT7"), class = "factor")),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
One option is to join the two datasets on the 'symbol' column and then dcast from 'long' to 'wide' specifying the fun.aggregate as length
library(data.table)
# Convert df_2 to a data.table and add a Sample_name column to it by joining
# with df_1 on symbol.
# NOTE(review): a symbol that matches several df_1 rows presumably keeps only
# one Sample_name per df_2 row -- confirm this is acceptable.
setDT(df_2)[df_1, Sample_name := Sample_name, on = .(symbol)]
# Make symbol a factor whose levels follow the order of appearance in df_2.
df_2[, symbol := factor(symbol, levels = unique(symbol))]
# Long -> wide with length as the aggregate (a row count per symbol/sample).
# Sample_name is restricted to the levels sample_1..sample_3, and drop = FALSE
# is passed so that empty combinations are presumably kept as 0 -- confirm.
dcast(df_2, symbol ~ factor(Sample_name, levels = paste0("sample_",
1:3)), length, drop = FALSE)
# symbol sample_1 sample_2 sample_3
# 1: MTPAP 1 0 0
# 2: TENT2 1 0 0
# 3: TENT4A 0 0 0
# 4: TENT4B 0 0 0
# 5: TUT1 0 0 0
# 6: TUT4 0 0 0
# 7: TUT7 0 0 0
# 8: POLR1A 0 0 1
# 9: POLR1B 0 0 0
#10: POLR1C 0 0 0
data
# df_1: ten symbol / Sample_name pairs, both columns stored as character.
df_1 <- structure(list(symbol = c("MTPAP", "MTPAP", "MTPAP", "TENT2",
"KIDINS220", "POLR1A", "CCDC138", "CCDC74A", "ATF2", "TLR9"),
Sample_name = c("sample_1", "sample_1", "sample_1", "sample_1",
"sample_2", "sample_3", "sample_4", "sample_5", "sample_6",
"sample_7")), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))
# df_2: ten HGNC.ID / symbol pairs, both columns stored as character.
df_2 <- structure(list(HGNC.ID = c("HGNC:25532", "HGNC:26776", "HGNC:16705",
"HGNC:30758", "HGNC:26184", "HGNC:28981", "HGNC:25817", "HGNC:17264",
"HGNC:20454", "HGNC:20194"), symbol = c("MTPAP", "TENT2", "TENT4A",
"TENT4B", "TUT1", "TUT4", "TUT7", "POLR1A", "POLR1B", "POLR1C"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10"))
I know you have two answers already, but here's how I would do it :)
The data:
# Example data: ten symbol / Sample_name pairs, read as factors. The header has
# one field fewer than the data lines, so the leading number on each data line
# becomes the row name.
# stringsAsFactors is spelled TRUE rather than T: T is an ordinary variable
# that can be reassigned, TRUE is a reserved constant.
df_1 <- read.table(text = "symbol Sample_name
1 MTPAP sample_1
2 MTPAP sample_1
3 MTPAP sample_1
4 TENT2 sample_1
5 KIDINS220 sample_2
6 POLR1A sample_3
7 CCDC138 sample_4
8 CCDC74A sample_5
9 ATF2 sample_6
10 TLR9 sample_7", header = TRUE,
stringsAsFactors = TRUE)
# Example data: ten HGNC.ID / symbol pairs, read as factors. The header has one
# field fewer than the data lines, so the leading number on each data line
# becomes the row name.
df_2 <- read.table(
  header = TRUE,
  stringsAsFactors = TRUE,
  text = " HGNC.ID symbol
1 HGNC:25532 MTPAP
2 HGNC:26776 TENT2
3 HGNC:16705 TENT4A
4 HGNC:30758 TENT4B
5 HGNC:26184 TUT1
6 HGNC:28981 TUT4
7 HGNC:25817 TUT7
8 HGNC:17264 POLR1A
9 HGNC:20454 POLR1B
10 HGNC:20194 POLR1C"
)
First the empty matrix (not really empty but filled with the default value ... 0):
# Zero-filled matrix with one row per distinct df_2 symbol and one column per
# distinct df_1 sample name, both in order of first appearance.
row_labels <- unique(df_2$symbol)
col_labels <- unique(df_1$Sample_name)
mat <- matrix(
  0,
  nrow = length(row_labels),
  ncol = length(col_labels),
  dimnames = list(row_labels, col_labels)
)
Make a table out of the symbols and samples in df_1:
library(dplyr)
# Cross-tabulate the distinct symbol / Sample_name pairs of df_1. Duplicated
# rows are removed first, so each pair is counted at most once.
mat_2 <- table(unique(df_1))
now we get the rows we want from mat_2 using row names
# Symbols that occur both in the df_1 cross-table and in df_2.
wanted_rows <- rownames(mat_2)[rownames(mat_2) %in% df_2$symbol]
# Copy the counts for those symbols into mat. Columns are matched by name, not
# by position: mat lists the samples in order of first appearance, whereas
# table() orders them by factor level, so the two orders can differ.
# drop = FALSE keeps a matrix even when only one symbol is selected.
mat[wanted_rows, ] <- mat_2[wanted_rows, colnames(mat), drop = FALSE]
I have two matrices similar to below:
a b c d id1 id2 id3 id4
1 2 3 4 b 1 2 3
6 7 8 9 c 0 2 4
d 1 2 2
a 5 6 8
The expected output is as follows:
b c d a
[1,] 2 3 4 1
[2,] 7 8 9 6
And I want to sort the column names of the left matrix according to the first column (id1) of the second matrix.
Does anyone have any suggestions for how this can be done?
I was simply trying "first matrix"[colnames("second matrix"),]. but it was not that easy.
Thanks
You could do
# Reorder the columns of first_mat by the names listed in second_mat's id1 column.
first_mat[, second_mat[, "id1"]]
# b c d a
#[1,] 2 3 4 1
#[2,] 7 8 9 6
data
# Example data. first_mat is a 2 x 4 integer matrix with columns a..d;
# second_mat is a 4 x 4 character matrix with columns id1..id4. Both are filled
# column by column and have no row names.
first_mat <- matrix(
  c(1L, 6L, 2L, 7L, 3L, 8L, 4L, 9L),
  nrow = 2L,
  dimnames = list(NULL, c("a", "b", "c", "d"))
)
second_mat <- matrix(
  c("b", "c", "d", "a",
    "1", "0", "1", "5",
    "2", "2", "2", "6",
    "3", "4", "2", "8"),
  nrow = 4L,
  dimnames = list(NULL, c("id1", "id2", "id3", "id4"))
)
I have a data frame with three variables and I want the first variable to be the row names, the second variable to be the column names, and the third variable to be the values associated with those two parameters, with NA or blank where data may be missing. Is this easy/possible to do in R?
example input
# Example input: eleven rows of Player (character), Type (factor with levels
# "Long" and "Short") and Yards (stored as character strings).
structure(list(
Player = c("1","1","2","2","3","3","4","4","5","5","6"),
Type = structure(c(2L, 1L, 2L, 1L, 2L, 1L,2L, 1L, 2L, 1L, 1L),
.Label = c("Long", "Short"), class = "factor"),
Yards = c("23","41","50","29","11","41","48","12","35","27","25")),
.Names = c("Player", "Type", "Yards"),
row.names = c(NA, 11L),
class = "data.frame")
Using the sample data you gave:
# The sample data assigned to df: eleven rows of Player (character), Type
# (factor with levels "Long" and "Short") and Yards (stored as character strings).
df <- structure(list(Player = c("1", "1", "2", "2", "3", "3", "4", "4", "5",
"5", "6"), Type = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L),
.Label = c("Long", "Short"), class = "factor"),
Yards = c("23", "41", "50", "29", "11", "41", "48", "12", "35", "27", "25")),
.Names = c("Player", "Type", "Yards"), row.names = c(NA, 11L),
class = "data.frame")
Player Type Yards
1 1 Short 23
2 1 Long 41
3 2 Short 50
4 2 Long 29
5 3 Short 11
6 3 Long 41
7 4 Short 48
8 4 Long 12
9 5 Short 35
10 5 Long 27
11 6 Long 25
dcast will be able to tabulate the two variables.
library(reshape2)
# Long -> wide: one row per Player, one column per Type, cells filled from Yards.
df.cast <- dcast(data = df, formula = Player ~ Type, value.var = "Yards")
The Player column will be a column, so you need to do a bit extra to make it the row names of the data.frame
# Promote the Player column to row names, then remove it from the columns.
row.names(df.cast) <- df.cast[["Player"]]
df.cast[["Player"]] <- NULL
Long Short
1 41 23
2 29 50
3 41 11
4 12 48
5 27 35
6 25 <NA>