Delete all rows corresponding to given ID - r

Data overview:
> str(dataStart[c("gvkey","DEF","FittedRob","NewCol")])
'data.frame': 1000 obs. of 4 variables:
$ gvkey : int 1004 1004 1004 1004 1004 1021 1021 1021 1021 1033 ...
$ DEF : int 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0...
$ FittedRob : num 0.549 0.532 0.519 0.539 0.531 ...
$ NewCol : chr 0.549 "Del" 0.519 0.539 "Del2" ...
Now I would like to delete all rows where "Del" or "Del2" occurs, and do that for the given "gvkey".
dataStart <- NewDataFrame[ ! NewDataFrame$NewCol %in% c("Del","Del2"),]
Where NewDataFrame is the data.frame containing the NewCol. This however deletes only the rows where "Del" and "Del2" occur, I would like to have deleted the whole "gvkey" if either "Del" or "Del2" occur. Thanks.

You first have to select all gvkey's you want to delete:
keys_to_delete <- unique(NewDataFrame$gvkey[NewDataFrame$NewCol %in%
c("Del","Del2")])
And then use these to delete the corresponding rows:
dataStart <- NewDataFrame[!(NewDataFrame$gvkey %in% keys_to_delete), ]

set.seed(42)
DF <- data.frame(a = sample(c("a", "b", "c"), 10, T), b = sample(1:10, 10, T))
# a b
# 1 c 5
# 2 c 8
# 3 a 10
# 4 c 3
# 5 b 5
# 6 b 10
# 7 c 10
# 8 a 2
# 9 b 5
# 10 c 6
library(plyr)
res <- ddply(DF, .(a), transform, test = any(b %in% c(2, 3)))
res[!res$test, 1:2]
# a b
# 3 b 5
# 4 b 10
# 5 b 5

Use a bit of ave action, using the example data @Roland used:
DF[ave(DF$b,DF$a, FUN=function(x) !any(x %in% c(2,3)))==1,]
And an adaptation of Jan's nice answer:
DF[!DF$a %in% unique(DF$a[DF$b %in% c(2,3)]) ,]
Both giving:
a b
5 b 5
6 b 10
9 b 5

Related

Create a matrix symmetric from long table in r

This is my data
# Long-format input: one row per (Var1, Var2) pair of the lower triangle
# (diagonal included), with the cell value in `pre`.
# Columns are named with `=`; using `<-` inside data.frame() would instead
# create global variables and leave the columns with auto-generated names.
df <- data.frame(
  Var1 = c("a", "b", "c", "d", "e", "b", "c", "d", "e", "c", "d", "e", "d", "e", "e"),
  Var2 = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "d", "d", "e"),
  pre = c(1, 2, 3, 4, 5, 1, 6, 7, 8, 1, 9, 10, 1, 11, 1)
)
I would like to build a symmetric matrix in R with the values of Var1 and Var2 as row names and column names, and the corresponding number in "pre" as the matrix values, like this:
a b c d e
a 1 2 3 4 5
b 2 1 6 7 8
c 3 6 1 9 10
d 4 7 9 1 11
e 5 8 10 11 1
This seems to be an easy problem, but I have googled a lot of posts, but it has not been solved, so I come here to ask, thank you!
Mengying
You can get the data in wide format first.
library(dplyr)
library(tidyr)
library(tibble)  # column_to_rownames() lives in tibble, not dplyr/tidyr

# Spread Var2 into columns; pairs absent from the long table (the upper
# triangle) are filled with 0. Var1 then becomes the row names so that
# as.matrix() yields a numeric matrix rather than a character one.
mat <- df %>%
  pivot_wider(names_from = Var2, values_from = pre, values_fill = 0) %>%
  column_to_rownames("Var1") %>%
  as.matrix()
mat
# a b c d e
#a 1 0 0 0 0
#b 2 1 0 0 0
#c 3 6 1 0 0
#d 4 7 9 1 0
#e 5 8 10 11 1
Since you have a symmetric matrix you can copy the lower triangular matrix to upper triangle.
mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
mat
# a b c d e
#a 1 2 3 4 5
#b 2 1 6 7 8
#c 3 6 1 9 10
#d 4 7 9 1 11
#e 5 8 10 11 1
data
df <- data.frame (Var1 = c("a", "b", "c","d","e","b","c","d","e","c","d","e","d","e","e"),
Var2 = c("a","a","a","a","a","b","b","b","b","c","c","c","d","d","e"),
pre = c(1,2,3,4,5,1,6,7,8,1,9,10,1,11,1) )
Here is an option with igraph package
library(igraph)

# Treat each (Var1, Var2) row as an undirected edge, so one edge fills both
# the [i, j] and the [j, i] cell of the adjacency matrix.
g <- graph_from_data_frame(df, directed = FALSE)
E(g)$pre <- df$pre
# as_adjacency_matrix() is the current name of the deprecated get.adjacency();
# `attr = "pre"` puts the edge attribute into the cells instead of edge counts.
as_adjacency_matrix(g, attr = "pre")
which gives
a b c d e
a 1 2 3 4 5
b 2 1 6 7 8
c 3 6 1 9 10
d 4 7 9 1 11
e 5 8 10 11 1
Base R solution (using data provided by Ronak):
# Crosstab:
mdat <- as.data.frame.matrix(xtabs(pre ~ Var1 + Var2, df))
# Reflect on the diag (thanks #Ronak Shah):
mdat[upper.tri(mdat)] <- t(mdat)[upper.tri(mdat)]
As @ThomasIsCoding points out as well, we can use this one-liner:
xtabs(pre ~ ., unique(rbind(df, cbind(setNames(rev(df[-3]), names(df)[-3]), df[3] ))))
As @thelatemail points out we can also:
xtabs(pre ~ ., unique(data.frame(Map(c, df, df[c(2,1,3)]))))
Here's a base R version:
df <- data.frame(
  Var1 = c("a", "b", "c", "d", "e", "b", "c", "d", "e", "c", "d", "e", "d", "e", "e"),
  Var2 = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "d", "d", "e"),
  pre = c(1, 2, 3, 4, 5, 1, 6, 7, 8, 1, 9, 10, 1, 11, 1)
)
# Generate a zero-filled matrix with one row per Var1 value and one column
# per Var2 value
mat2 <- matrix(0, length(unique(df$Var1)), length(unique(df$Var2)))
# Name the rows and columns so cells can be addressed by (Var1, Var2) label
rownames(mat2) <- unique(df$Var1)
colnames(mat2) <- unique(df$Var2)
# A two-column character matrix used as an index selects cells by
# (row name, column name); this fills the lower triangle and the diagonal
mat2[as.matrix(df[, 1:2])] <- df$pre
# Mirror the lower triangle into the upper one. The mask must be built from
# mat2 itself so the snippet does not depend on any other object
mat2[upper.tri(mat2)] <- t(mat2)[upper.tri(mat2)]
# See results
mat2
# a b c d e
# a 1 2 3 4 5
# b 2 1 6 7 8
# c 3 6 1 9 10
# d 4 7 9 1 11
# e 5 8 10 11 1

Reduce number of levels for each factor dplyr approach

I am trying to reduce the number of levels in each factor variable in my data. I want to reduce the number of levels doing 2 operations:
If the number of levels is larger than a cut-off then replace the less frequent levels to a new level until the number of levels has reached the cut-off
Replace levels in a factor with not enough observations to a new level
I wrote a function which works fine, but I don't like the code. It does not matter if the level REMAIN has not enough observations. I prefer a dplyr approach.
# Collapse rare levels of every factor column into a single "REMAIN" level.
#
# Two passes are made over the factor columns:
#   1. A column with more than `max_levels` levels keeps only its
#      `max_levels - 1` most frequent levels; all other values become
#      "REMAIN".
#   2. Every level observed fewer than `min_values_factor` times is replaced
#      by "REMAIN".
# Non-factor columns are left untouched.
#
# Args:
#   data:              a data.table (the `:=` updates below are data.table
#                      syntax and do not work on a plain data.frame).
#   max_levels:        maximum number of levels allowed per factor,
#                      counting "REMAIN".
#   min_values_factor: minimum number of observations a level needs in
#                      order to be kept.
#
# Returns:
#   `data` with the recoded factor columns; unused levels are dropped.
#
# NOTE(review): `:=` assigns by reference, so a data.table passed in by the
# caller is presumably modified as well -- pass `copy(data)` if the input
# must stay intact.
ReplaceFactor <- function(data, max_levels, min_values_factor){
  # First make sure that not too many levels are in a factor.
  # is.factor() also covers ordered factors, whose class() has length 2 and
  # would make `class(x) == "factor"` an invalid `if` condition.
  for(i in colnames(data)){
    if(is.factor(data[[i]]) && nlevels(data[[i]]) > max_levels){
      counts <- sort(table(data[[i]]), decreasing = TRUE)
      # seq_len() yields no index at all for max_levels == 1, whereas
      # 1:(max_levels - 1) would count down (1, 0) and keep one level too many
      levels_keep <- names(counts)[seq_len(max_levels - 1)]
      data[!get(i) %in% levels_keep, (i) := "REMAIN"]
      # Rebuild the factor so levels that are no longer used disappear
      data[[i]] <- as.factor(as.character(data[[i]]))
    }
  }
  # Now make sure that each level has enough observations
  for(i in colnames(data)){
    if(is.factor(data[[i]])){
      counts <- table(data[[i]])
      levels_replace <- names(counts)[counts < min_values_factor]
      # Checking the length avoids min() warning on a column without levels
      if(length(levels_replace) > 0){
        data[get(i) %in% levels_replace, (i) := "REMAIN"]
        data[[i]] <- as.factor(as.character(data[[i]]))
      }
    }
  }
  data
}
df <- data.frame(A = c("A","A","B","B","C","C","C","C","C"),
B = 1:9,
C = c("A","A","B","B","C","C","C","D","D"),
D = c("A","B","E", "E", "E","E","E", "E", "E"))
str(df)
'data.frame': 9 obs. of 4 variables:
$ A: Factor w/ 3 levels "A","B","C": 1 1 2 2 3 3 3 3 3
$ B: int 1 2 3 4 5 6 7 8 9
$ C: Factor w/ 4 levels "A","B","C","D": 1 1 2 2 3 3 3 4 4
$ D: Factor w/ 3 levels "A","B","E": 1 2 3 3 3 3 3 3 3
dt2 <- ReplaceFactor(data = data.table(df),
max_levels = 3,
min_values_factor = 2)
str(dt2)
Classes ‘data.table’ and 'data.frame': 9 obs. of 4 variables:
$ A: Factor w/ 3 levels "A","B","C": 1 1 2 2 3 3 3 3 3
$ B: int 1 2 3 4 5 6 7 8 9
$ C: Factor w/ 3 levels "A","C","REMAIN": 1 1 3 3 2 2 2 3 3
$ D: Factor w/ 2 levels "E","REMAIN": 2 2 1 1 1 1 1 1 1
- attr(*, ".internal.selfref")=<externalptr>
dt2
A B C D
1: A 1 A REMAIN
2: A 2 A REMAIN
3: B 3 REMAIN E
4: B 4 REMAIN E
5: C 5 C E
6: C 6 C E
7: C 7 C E
8: C 8 REMAIN E
9: C 9 REMAIN E
Using forcats:
library(dplyr)
library(forcats)
max_levels <- 3
min_values_factor <- 2
df %>%
mutate_if(is.factor, fct_lump, n = max_levels,
other_level = "REMAIN", ties.method = "first") %>%
mutate_if(is.factor, fct_lump, prop = (min_values_factor - 1) / nrow(.),
other_level = "REMAIN")
# A B C D
# 1 A 1 A REMAIN
# 2 A 2 A REMAIN
# 3 B 3 B E
# 4 B 4 B E
# 5 C 5 C E
# 6 C 6 C E
# 7 C 7 C E
# 8 C 8 REMAIN E
# 9 C 9 REMAIN E
(Oh, and I wasn't able to replicate the exact behavior of your function, but you might get what you want by tweaking ties.method and subtracting 1 from max_levels).

Selecting rows by offsetting

I have this data frame, lets call it my_df.
It looks like this:
my_df <- data.frame(rnorm(n = 30,sd=.5),rep(c("a","b","c"),each=10))
names(my_df) <- c("num","let")
head(my_df)
num let
1 0.01202600 a
2 1.09025768 a
3 -0.08656178 a
4 -0.04847073 a
5 -0.63750258 a
6 0.58846135 a
What I want to do is select all of the rows where my_df$let == "b", as well as the five rows before the first row where my_df$let == "b" and the five rows after the last row where my_df$let == "b". So basically my_df[6:25,].
The data I'm actually working with is hundreds of thousands of lines long and I don't know what rows is what, and besides that each set of data doesn't match up row wise and I can't take the time to go through each set of data individually. I've been using a subset to select the data I want, but I don't know how to select the additional rows outside of the subset (1000 rows before and after).
Here's my subset for what I'm doing:
#The following lines seperate pXX_NoNegative into individual field sections
p04_HighWeeds <- subset(p04_NoNegative, subset = p04_NoNegative$GS_Field == "High Weeds")
I want to select all of the rows that the above code selects, but I also want 100 rows before that, and 1000 rows after that.
If you need any additional information that may help you please ask.
Here's another idea using dplyr:
library(dplyr)
my_df %>% filter(lead(let == "b", 5) | lag(let == "b", 5))
Or, as per @akrun's suggestion, using the devel version of data.table:
setDT(my_df)[shift(let == "b", 5) | shift(let == "b", type = "lead", 5)]
Which gives:
# num let
#1 0.36723709 a
#2 0.24743170 a
#3 -0.33339924 a
#4 -0.57024317 a
#5 0.03390278 a
#6 -0.43495096 b
#7 -0.85107347 b
#8 0.53048931 b
#9 -0.26739611 b
#10 -0.96029355 b
#11 -0.71737408 b
#12 0.34324685 b
#13 0.12319646 b
#14 0.75207703 b
#15 0.18134006 b
#16 -0.02230777 c
#17 0.42646106 c
#18 -0.11055478 c
#19 0.06013187 c
#20 0.50782158 c
Normally splitting a data frame into a list of data frames based on some categorization is straightforward -- you would use split(my_df, my_df$let) in your case. However with the added complication that you want some number of rows before or after I would operate over the set of unique categorizations, selecting the rows you want in each case:
# Number of extra rows to include ahead of and behind each category's span
before <- 5
after <- 5

# Build a named list with one data frame per distinct value of `let`: the
# rows from that value's first occurrence to its last, widened by
# `before`/`after` rows and clamped to the bounds of my_df.
ret <- setNames(
  lapply(unique(my_df$let), function(level) {
    hit_rows <- which(my_df$let == level)
    lower <- max(1, min(hit_rows) - before)
    upper <- min(nrow(my_df), max(hit_rows) + after)
    my_df[lower:upper, ]
  }),
  unique(my_df$let)
)
You can grab the observations for any category you want out of the returned list:
ret$b # Also works: ret[["b"]]
# num let
# 6 -0.197901427 a
# 7 0.194607192 a
# 8 -0.107318203 a
# 9 -0.365313233 a
# 10 -0.188926562 a
# 11 0.636272295 b
# 12 -0.058791973 b
# 13 -0.231029510 b
# 14 0.519441716 b
# 15 0.239510912 b
# 16 0.107025658 b
# 17 -0.446644081 b
# 18 0.145052077 b
# 19 -0.426090749 b
# 20 -0.356062993 b
# 21 -0.155012203 c
# 22 -0.007968255 c
# 23 -0.504253089 c
# 24 0.081624303 c
# 25 -0.657008233 c
I recently answered a nearly identical question: Select n rows after specific number. Adapting the single-segment solution to your data:
set.seed(1)
my_df <- data.frame(rnorm(n = 30, sd = .5), rep(c("a", "b", "c"), each = 10))
names(my_df) <- c("num", "let")
# First and last row index of the "b" segment
brange <- range(which(my_df$let == "b"))
# Signed distance of each row to the "b" segment: negative before it, 0
# inside it, positive after it. seq_len() yields an empty vector when the
# segment touches the first or last row, where `(1 - start):-1` and
# `1:(n - end)` would count backwards and produce a vector of the wrong length.
my_df$offb <- c(
  -rev(seq_len(brange[1] - 1)),
  rep(0, diff(brange) + 1),
  seq_len(nrow(my_df) - brange[2])
)
my_df
## num let offb
## 1 -0.313226905 a -10
## 2 0.091821662 a -9
## 3 -0.417814306 a -8
## 4 0.797640401 a -7
## 5 0.164753886 a -6
## 6 -0.410234192 a -5
## 7 0.243714526 a -4
## 8 0.369162353 a -3
## 9 0.287890676 a -2
## 10 -0.152694194 a -1
## 11 0.755890584 b 0
## 12 0.194921618 b 0
## 13 -0.310620290 b 0
## 14 -1.107349944 b 0
## 15 0.562465459 b 0
## 16 -0.022466805 b 0
## 17 -0.008095132 b 0
## 18 0.471918105 b 0
## 19 0.410610598 b 0
## 20 0.296950661 b 0
## 21 0.459488686 c 1
## 22 0.391068150 c 2
## 23 0.037282492 c 3
## 24 -0.994675848 c 4
## 25 0.309912874 c 5
## 26 -0.028064370 c 6
## 27 -0.077897753 c 7
## 28 -0.735376192 c 8
## 29 -0.239075028 c 9
## 30 0.208970780 c 10
subset(my_df,offb>=-5&offb<=5);
## num let offb
## 6 -0.410234192 a -5
## 7 0.243714526 a -4
## 8 0.369162353 a -3
## 9 0.287890676 a -2
## 10 -0.152694194 a -1
## 11 0.755890584 b 0
## 12 0.194921618 b 0
## 13 -0.310620290 b 0
## 14 -1.107349944 b 0
## 15 0.562465459 b 0
## 16 -0.022466805 b 0
## 17 -0.008095132 b 0
## 18 0.471918105 b 0
## 19 0.410610598 b 0
## 20 0.296950661 b 0
## 21 0.459488686 c 1
## 22 0.391068150 c 2
## 23 0.037282492 c 3
## 24 -0.994675848 c 4
## 25 0.309912874 c 5

R: data transformation of a column in data frame

I have a data.frame as below:
> a <- c(98:103, 998:1003)
> b <- 1:length(a)
> data <- data.frame(a,b)
> data
a b
1 98 1
2 99 2
3 100 3
4 101 4
5 102 5
6 103 6
7 998 7
8 999 8
9 1000 9
10 1001 10
11 1002 11
12 1003 12
I would like to add a column based on column a.
for column a less than 100, i will assign "A" to the new column
for column a in <1000 >=100, i will assign "B" to the new column
and "C" otherwise
My approach is
> data$c <- data$a
>
> A <- 1:99
> B <- 100:999
> for (i in 1:length(a)){
+ if (data[i,1] %in% A){
+ data[i,3] <- "A"
+ } else if (data[i,1] %in% B){
+ data[i,3] <- "B"
+ } else {data[i,3] <- "C"}
+ }
> data
a b c
1 98 1 A
2 99 2 A
3 100 3 B
4 101 4 B
5 102 5 B
6 103 6 B
7 998 7 B
8 999 8 B
9 1000 9 C
10 1001 10 C
11 1002 11 C
12 1003 12 C
>
My real data, however, has over 500,000 rows. Is there a better solution?
Find below a solution using data.table. This version might be especially useful if your key variable (here a) is not numeric.
# Set up data
a <- c(98:103, 998:1003)
b <- 1:length(a)
# Set of values to look for
A <- 1:99
B <- 100:999
# Create data table and set key
DT <- data.table(a,b)
setkey(DT, a)
# Add new variable
DT[J(A), c:="a"]
DT[J(B), c:="b"]
DT[is.na(DT$c), c:="c"]
If your key variable is not numeric, you can change DT[J(A), c:="a"] to DT[A,c:="a"].

Convert character variable column to vector

Have a .csv with 4 columns of values:
data<-read.csv("C:\\Users\\mtatange\\Desktop\\Dataset.csv")
A B C D
1 1 NA 1
2 2 4 1
3 3 6 4
4 NA 8 5
data$E<-do.call(paste,c(data[c("A","B","C","D")], sep=""))
data
A B C D E
1 1 NA 1 11NA1
2 2 4 1 2241
3 3 6 4 3364
4 NA 8 5 4NA85
summary(data)
E
Length: 4
Class: Character
Mode: Character
I need column "E" to be a vector, it cannot stay as a character variable. I tried:
data$E[is.na(a$E)]<-0
But that still left the column as a character variable. How do I convert the column to a vector variable?
Get rid of the NA's first..:
df[ is.na(df) ] <- 0
df$E <- apply(df,1,function(x) as.numeric(paste0(x , collapse="")))
A B C D E
1 1 1 0 1 1101
2 2 2 4 1 2241
3 3 3 6 4 3364
4 4 0 8 5 4085
apply(df , 2 , class )
A B C D E
"numeric" "numeric" "numeric" "numeric" "numeric"
The solution above gives you the idea. Alternatively, a (relatively) faster way of doing this is:
df[ is.na(df) ] <- 0
df$E <- as.numeric(do.call(paste0, df))
And replacement of NA's is very fast. In a test on a 3 column table with 300,000 rows on a MBP laptop...
df <- data.frame( a = sample(c(1:9,NA) , 3e5 , repl = TRUE ) , b = sample(c(1:9,NA) , 3e5 , repl = TRUE ) , c = sample(c(1:9,NA) , 3e5 , repl = TRUE ) )
sum(is.na(df))
[1] 90118
system.time( (df[is.na(df)] <- 0 ) )
user system elapsed
0.250 0.021 0.269
nrow(df)
[1] 300000

Resources