How to move two specific rows to top of dataframe? - r

Below I have a DF.
A B C D
a 4 2 2
g 5 2 2
d 7 65 7
e 3 6 7
I would like to make this DF so that column A has "g" in the first row, and "d" in the second row. I would like to do this by calling the value in column A (rather than an index). How can I do this?
Ideal output
A B C D
g 5 2 2
d 7 65 7
a 4 2 2
e 3 6 7

We may convert the column to a factor with the levels in the desired order before arranging:
library(forcats)
library(dplyr)
# Sort the rows on a factor version of A whose levels "g" and "d" have been
# moved to the front; the remaining levels follow in their existing order.
arrange(DF, fct_relevel(A, "g", "d"))
A B C D
1 g 5 2 2
2 d 7 65 7
3 a 4 2 2
4 e 3 6 7
with fct_relevel, we can specify the order of specific levels without specifying the rest of the levels
> with(DF, fct_relevel(A, 'g', 'd'))
[1] a g d e
Levels: g d a e
data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA,
-4L))

Another possible solution:
library(dplyr)

df <- data.frame(
  stringsAsFactors = FALSE,
  A = c("a", "g", "d", "e"),
  B = c(4L, 5L, 7L, 3L),
  C = c(2L, 2L, 65L, 6L),
  D = c(2L, 2L, 7L, 7L)
)

# Build the complete target ordering: "g", "d", then every other value of A.
# match() then gives each row its rank in that ordering, and arrange() sorts
# on the rank. (setdiff(c("g", "d"), A) would be empty here, leaving the
# other rows with an NA rank.)
df %>% arrange(match(A, c("g", "d", setdiff(A, c("g", "d")))))
#> A B C D
#> 1 g 5 2 2
#> 2 d 7 65 7
#> 3 a 4 2 2
#> 4 e 3 6 7

Try the code below
# Desired sequence of A values: "g", "d", then all remaining values in their
# current order. match() converts that sequence into row positions of df.
df[match(c("g", "d", df$A[!(df$A %in% c("g", "d"))]), df$A), ]
and you will see
A B C D
2 g 5 2 2
3 d 7 65 7
1 a 4 2 2
4 e 3 6 7

Just to add a base R solution if you are not interested in external packages, you can specify the row order directly:
# Sample Data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA, -4L))
A hard code for this example:
DF2 <- DF[c(2,3,1,4),]
A more generalizable example:
# Specify desired rows, in the order the values should appear at the top.
# Looking up each value in turn keeps the requested order ("g" before "d")
# regardless of where the rows currently sit; a value that is absent simply
# contributes no rows, and a repeated value contributes all of its rows.
rownums <- unlist(lapply(c("g", "d"), function(val) which(DF$A == val)))
# Specify other rows (everything not already selected, in original order)
otherrows <- setdiff(seq_len(nrow(DF)), rownums)
# Organize
DF2 <- DF[c(rownums, otherrows), ]

Related

How to select lines which have equal values in columns and mantain this characteristics

I have a complete data frame of all cities in Brazil, but I want only some predefined cities, which are listed in one of its columns. I'd like to keep all the columns of my data frame but select only the rows whose city in the all-cities column also appears in the predefined-cities column.
data = read.csv(file="C:/Users/guilherme/Desktop/data.csv", header=TRUE, sep=";")
data
> AllCities Year1990 Year200 PredefinedCities CharacCities1 CharacCities2
1 A 2 4 C 12 5
2 B 2 2 A 11 10
3 C 3 4 F 09 2
4 D 4 2
5 E 5 6
6 F 6 2
I want the following
> data
AllCities Year1990 Year200 PredefinedCities CharacCities1 CharacCities2
1 C 3 4 C 12 5
2 A 2 4 A 11 10
3 F 6 2 F 09 2
You need merge -
merge(
data[, c("AllCities", "Year1990", "Year200")],
data[, c("PredefinedCities", "CharacCities1", "CharacCities2")],
by.x = "AllCities", by.y = "PredefinedCities"
)
AllCities Year1990 Year200 CharacCities1 CharacCities2
1 A 2 4 11 10
2 C 3 4 12 5
3 F 6 2 9 2
Note - Your data format is unusual. If you can, you should fix the data source so that it gives you the AllCities and PredefinedCities tables separately, or maybe even join them correctly before creating the csv file.
Data -
structure(list(AllCities = c("A", "B", "C", "D", "E", "F"), Year1990 = c(2L,
2L, 3L, 4L, 5L, 6L), Year200 = c(4L, 2L, 4L, 2L, 6L, 2L), PredefinedCities = c("C",
"A", "F", "", "", ""), CharacCities1 = c(12L, 11L, 9L, NA, NA,
NA), CharacCities2 = c(5L, 10L, 2L, NA, NA, NA)), .Names = c("AllCities",
"Year1990", "Year200", "PredefinedCities", "CharacCities1", "CharacCities2"
), class = "data.frame", row.names = c(NA, -6L))
data <- data[data$AllCities %in% data$PredefinedCities,]

merge and get max value from two different datatables in R

I've 2 different data.tables. I need to merge and get max value based on a row values. The examples of two tables are given as Input below and expected output shown below.
Input
Table 1
X A B
A 3
B 4 6
C 5
D 9 12
Table 2
X A B
A 1 5
B 6 8
C 7 14
D 5
E 1 1
F 2 3
G 5 6
Expected Output:
X A B
A 3 5
B 6 8
C 7 14
D 9 12
E 1 1
F 2 3
G 5 6
We can rbind the two datasets and do a group by max
library(data.table)
rbindlist(list(tbl1, tbl2))[, lapply(.SD, max, na.rm = TRUE), X]
# X A B
#1: A 3 5
#2: B 6 8
#3: C 7 14
#4: D 9 12
#5: E 1 1
#6: F 2 3
#7: G 5 6
If we are using base R, then use aggregate after rbinding the datasets
aggregate(.~ X, rbind(tbl1, tbl2), max, na.rm = TRUE, na.action = NULL)
NOTE: Assume that the 'A', 'B' columns are numeric and blanks are NA
data
tbl1 <- structure(list(X = c("A", "B", "C", "D"), A = c(3L, 4L, 5L, 9L
), B = c(NA, 6L, NA, 12L)), .Names = c("X", "A", "B"), class = "data.frame",
row.names = c(NA, -4L))
tbl2 <- structure(list(X = c("A", "B", "C", "D", "E", "F", "G"), A = c(1L,
6L, 7L, 5L, 1L, 2L, 5L), B = c(5L, 8L, 14L, NA, 1L, 3L, 6L)), .Names = c("X",
"A", "B"), class = "data.frame",
row.names = c(NA, -7L))

How to collapse session path data into from-to paths for visualizing network data?

What are some ways to transform session path data such as this:
df
# Session Link1 Link2 Link3 Link4 Link5
# 1 1 A B
# 2 2 C
# 3 3 D A B
# 4 4 C F G H J
# 5 5 A B C
Into a data set that looks like this:
desired
# Session From To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Data for reproducibility:
df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))
We could use data.table. Convert the 'data.frame' to a 'data.table' (setDT(df)). Reshape from 'wide' to 'long' format with melt, specifying the id.var as 'Session'. Remove the 'value' elements that are empty [value!='']. Grouped by 'Session', we append an 'NA' to the 'value' column for those 'Session's that have only a single row (if...else); the result comes back in a column named 'V1'. We then create two columns ('From' and 'To') by removing the last and the first element of 'V1', respectively, grouped by 'Session'.
library(data.table)#v1.9.5+
melt(setDT(df), id.var='Session')[value!=''][,
if(.N==1L) c(value, NA) else value, by = Session][,
list(From=V1[-.N], To=V1[-1L]), by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
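The above could be simplified to a single block after the melt step. Note that tmp[-.N] does not work here: for a session with a single row, tmp has been padded to length 2 while .N is still 1, so tmp[-.N] would drop the first element instead of the last. So I used tmp[1:(.N-1)].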
melt(setDT(df), id.var= 'Session')[value!='', {
tmp <- if(.N==1L) c(value, NA) else value
list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
Inspired by #akrun, this is my personal stab at the problem. Granted, the results are tweaked to include the terminal from-to path for each pair:
library(dplyr)
library(tidyr)
gather(df, "Link_Num", "Value", -Session) %>%
group_by(Session) %>%
mutate(to = Value,
from = lag(to)) %>%
filter(Link_Num != "Link1" &
from != "") %>%
select(Session, from, to, Link_Num) %>%
arrange(Session)
Which yields:
Session from to Link_Num
1 1 A B Link2
2 1 B Link3
3 2 C Link2
4 3 D A Link2
5 3 A B Link3
6 3 B Link4
7 4 C F Link2
8 4 F G Link3
9 4 G H Link4
10 4 H J Link5
11 5 A B Link2
12 5 B C Link3
13 5 C Link4
Another approach using melt (from reshape2) together with dplyr's lead:
library(dplyr)
library(reshape2)  # melt() is provided by reshape2, not dplyr
# Add an empty trailing column so that, once the data is stacked, the last
# link of every session is followed by "" rather than by the next session's
# first link.
df$spacer <- ""
df %>%
  melt(id.vars = "Session") %>%
  arrange(Session) %>%
  # Pair every link with the one that follows it.
  mutate(To = lead(value)) %>%
  # Keep genuine from-to pairs, plus the first link of a session that has no
  # second link.
  filter(To != "" & value != "" | To == "" & variable == "Link1") %>%
  mutate(To = ifelse(To == "", NA, To)) %>%
  select(-variable)
# Session value To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C

Multiply a table(file1) with individual cells of a column(file2) using R

File 1:
Ele A B C D
Es 1 2 3 4
Ep 2 4 3 4
Ek 1 9 3 8
File 2:
A 1
B 2
C 3
D 5
Need is to ensure that each element under Column A (file 1) gets multiplied by the value assigned to A in file 2 (and so on). I know matrix multiplication in R but this is not the case of matrix multiplication I suppose. Help would be greatly appreciated. Thanks
You could try
indx <- df2$Col1
df1[indx]*df2$Col2[col(df1[indx])]
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
Or you could use sweep
sweep(df1[indx], 2, df2$Col2, '*')
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
data
df1 <- structure(list(Ele = c("Es", "Ep", "Ek"), A = c(1L, 2L, 1L),
B = c(2L, 4L, 9L), C = c(3L, 3L, 3L), D = c(4L, 4L, 8L)),
.Names = c("Ele", "A", "B", "C", "D"), class = "data.frame",
row.names = c(NA, -3L))
df2 <- structure(list(Col1 = c("A", "B", "C", "D"), Col2 = c(1L, 2L,
3L, 5L)), .Names = c("Col1", "Col2"), class = "data.frame",
row.names = c(NA, -4L))

R- How to merge multiple dataframes of different lengths?

I have been stuck with this issue for a while now. Need some help.
I am reading the following files (there can be more than 3 files) into a dataframe.
My input files look like the following:
file1:
someName someMOD someID
A T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P),S762(P) 1
B S495(P) 2
C S162(P),Q159(D) 3
D S45(P),C47(C),S48(P),S26(P) 4
E S18(P) 5
file2:
someName someMOD someID
C S162(P),Q159(D) 3
D S45(P),C47(C),S48(P),S26(P) 4
F S182(P) 6
E S18(P) 5
Z Q100(P) 9
A T754(P),M691(O),S694(P),S739(P),S740(P) 1
file3:
someName someMOD someID
A T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P) 1
B S495(P) 2
D S45(P),C47(C),S48(P),S26(P) 4
E S18(P) 5
F S182(P) 6
L Z182(P) 8
C S162(P),Q159(D) 3
My Code:
# Read the spreadsheets found in the working directory and collect those whose
# file name contains "Drug_Rep" into the list `a`.
# NOTE(review): read.xls() is not base R; presumably it comes from the gdata
# package, which must be loaded beforehand -- confirm.
fileList <- dir(pattern="*.xls")
i<-1
j<-1
a<-list()
mybigtable<-data.frame()  # call data.frame() so this is an empty data frame, not the function
for (f in seq_along(fileList)){
  fileName <- fileList[f]
  X <-read.xls(fileName)
  if(regexpr("Drug_Rep", fileName)[1]>0){
    a[[i]]<-X
    # Advance the slot index only when a file was stored, so that `a` never
    # contains NULL gaps for the files that were skipped.
    i<-i+1
  }
  # Files whose name does not contain "Drug_Rep" are ignored.
}
#Now i want to merge my dataframes
# Full outer join of two data frames on the key columns "someName" and
# "someID": rows present in only one input are kept, with NA in the columns
# that come from the other input.
mymerge <- function(x, y) {
  merge(x, y, by = c("someName", "someID"), all = TRUE)
}
Reduce(mymerge,a) #passing my list of dataframes 'a'
I did dput() on my 'a' list:
list(structure(list(someName = structure(c(1L, 2L, 4L, 5L, 6L,
7L, 3L), .Label = c("A", "B", "C", "D", "E", "F", "L"), class = "factor"),
someMOD = structure(c(6L, 5L, 4L, 2L, 3L, 7L, 1L), .Label = c("S162(P),Q159(D)",
"S18(P)", "S182(P)", "S45(P),C47(C),S48(P),S26(P)", "S495(P)",
"T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P)",
"Z182(P)"), class = "factor"), someID = c(1L, 2L, 4L, 5L,
6L, 8L, 3L)), .Names = c("someName", "someMOD", "someID"), class = "data.frame", row.names = c(NA,
-7L)), structure(list(someName = structure(1:5, .Label = c("A",
"B", "C", "D", "E"), class = "factor"), someMOD = structure(c(5L,
4L, 1L, 3L, 2L), .Label = c("S162(P),Q159(D)", "S18(P)", "S45(P),C47(C),S48(P),S26(P)",
"S495(P)", "T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P),S762(P)"
), class = "factor"), someID = 1:5), .Names = c("someName", "someMOD",
"someID"), class = "data.frame", row.names = c(NA, -5L)), structure(list(
someName = structure(c(2L, 3L, 5L, 4L, 6L, 1L), .Label = c("A",
"C", "D", "E", "F", "Z"), class = "factor"), someMOD = structure(c(2L,
5L, 4L, 3L, 1L, 6L), .Label = c("Q100(P)", "S162(P),Q159(D)",
"S18(P)", "S182(P)", "S45(P),C47(C),S48(P),S26(P)", "T754(P),M691(O),S694(P),S739(P),S740(P)"
), class = "factor"), someID = c(3L, 4L, 6L, 5L, 9L, 1L)), .Names = c("someName",
"someMOD", "someID"), class = "data.frame", row.names = c(NA,
-6L)))
What is my mistake in populating a list? Any help is really appreciated.
I am just trying to get an output like the following:
The problem with the code I gave you before is that merge gets confused if there are any duplicate column names, and you're merging more than 3 datasets. You'll have to rename your someMOD columns so they don't clash. A for loop works as well as anything for this purpose.
# Positions of the columns that are not merge keys; their names would collide
# across the data frames held in `a`.
dupvars <- which(!(names(a[[1]]) %in% c("someName", "someID")))
# Suffix those column names with the position of the data frame in the list.
for (i in seq_along(a)) {
  names(a[[i]])[dupvars] <- paste0(names(a[[i]])[dupvars], i)
}
# and then merge
Reduce(mymerge, a)
Perhaps the problem is that you're actually not trying to merge in the standard sense, but reshape. In this case, you can rbind all the data.frames together after adding a "time" variable, and use dcast from "reshape2" to get what you're after:
Add a "time" variable and rbind the data.frames together
temp <- do.call(rbind,
lapply(seq_along(a),
function(x) data.frame(a[[x]], time = x)))
head(temp)
# someName someMOD someID time
# 1 A T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P) 1 1
# 2 B S495(P) 2 1
# 3 D S45(P),C47(C),S48(P),S26(P) 4 1
# 4 E S18(P) 5 1
# 5 F S182(P) 6 1
# 6 L Z182(P) 8 1
Transform the data.frame from a "long" format to a "wide" format
library(reshape2)
dcast(temp, someName + someID ~ time, value.var="someMOD")
# someName someID 1
# 1 A 1 T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P)
# 2 B 2 S495(P)
# 3 C 3 S162(P),Q159(D)
# 4 D 4 S45(P),C47(C),S48(P),S26(P)
# 5 E 5 S18(P)
# 6 F 6 S182(P)
# 7 L 8 Z182(P)
# 8 Z 9 <NA>
# 2
# 1 T754(P),M691(O),S692(P),S694(P),S739(P),S740(P),S759(P),S762(P)
# 2 S495(P)
# 3 S162(P),Q159(D)
# 4 S45(P),C47(C),S48(P),S26(P)
# 5 S18(P)
# 6 <NA>
# 7 <NA>
# 8 <NA>
# 3
# 1 T754(P),M691(O),S694(P),S739(P),S740(P)
# 2 <NA>
# 3 S162(P),Q159(D)
# 4 S45(P),C47(C),S48(P),S26(P)
# 5 S18(P)
# 6 S182(P)
# 7 <NA>
# 8 Q100(P)

Resources