Element frequency and column name in R

I have a dataframe that has four columns A, B, C and D:
A B C D
a a b c
b c x e
c d y a
d z
e
f
I would like to get the frequency of each element and the list of columns it appears in, ordered by frequency ranking. The output would be something like this:
Ranking frequency column
a 1 3 A, B, D
c 1 3 A, B, D
b 2 2 A, C
d 2 2 A, B
e 2 2 A, D
f .....
I would appreciate any help.
Thank you!

Something like this maybe:
Data
df <- read.table(header=T, text='A B C D
a a b c
b c x e
c d y a
d NA NA z
e NA NA NA
f NA NA NA',stringsAsFactors=F)
Solution
#find unique elements
elements <- unique(unlist(sapply(df, unique)))
#use an lapply to find the info you need
df2 <- data.frame(do.call(rbind,
  lapply(elements, function(x) {
    #find the rows and columns of the element
    a <- which(df == x, arr.ind = TRUE)
    #find the column names of the element found
    b <- names(df[a[, 2]])
    #find the frequency
    c <- nrow(a)
    #produce the output
    c(x, c, paste(b, collapse = ','))
  })))
#remove NAs
df2 <- na.omit(df2)
#change column names
colnames(df2) <- c('element','frequency', 'columns')
#order according to frequency
df2 <- df2[order(df2$frequency, decreasing=TRUE),]
#create the ranking column
df2$ranking <- as.numeric(factor(df2$frequency,levels=unique(df2$frequency)))
Output:
> df2
element frequency columns ranking
1 a 3 A,B,D 1
3 c 3 A,B,D 1
2 b 2 A,C 2
4 d 2 A,B 2
5 e 2 A,D 2
6 f 1 A 3
8 x 1 C 3
9 y 1 C 3
10 z 1 D 3
And if you want the element column to be the row names and the ranking column to come first, you can also do:
row.names(df2) <- df2$element
df2$element <- NULL
df2 <- df2[c('ranking','frequency','columns')]
Output:
> df2
ranking frequency columns
a 1 3 A,B,D
c 1 3 A,B,D
b 2 2 A,C
d 2 2 A,B
e 2 2 A,D
f 3 1 A
x 3 1 C
y 3 1 C
z 3 1 D
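One caveat worth adding (my note, not part of the original answer): because do.call(rbind, ...) binds character vectors, the frequency column comes back as character (or factor, depending on your R version), so order() sorts it lexicographically. That is fine here, but with frequencies of 10 or more it would not be; converting before sorting keeps the sort numeric:
df2$frequency <- as.numeric(as.character(df2$frequency))  #character/factor -> numeric
df2 <- df2[order(df2$frequency, decreasing=TRUE),]        #now a true numeric sort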

Here's an approach using "dplyr" and "tidyr":
library(dplyr)
library(tidyr)
df %>%
  gather(var, val, everything()) %>%            ## Make a long dataset
  na.omit %>%                                   ## We don't need the NA values
  group_by(val) %>%                             ## All calculations grouped by val
  summarise(column = toString(var),             ## This collapses
            freq = n()) %>%                     ## This counts
  mutate(ranking = dense_rank(desc(freq))) %>%  ## This ranks
  arrange(ranking)                              ## This sorts
# Source: local data frame [9 x 4]
#
# val column freq ranking
# 1 a A, B, D 3 1
# 2 c A, B, D 3 1
# 3 b A, C 2 2
# 4 d A, B 2 2
# 5 e A, D 2 2
# 6 f A 1 3
# 7 x C 1 3
# 8 y C 1 3
# 9 z D 1 3
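As a side note (not in the original answer): in tidyr 1.0 and later, gather() is superseded by pivot_longer(). A sketch of an equivalent pipeline, assuming a recent tidyr:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(everything(), names_to = "var", values_to = "val") %>% ## Make a long dataset
  filter(!is.na(val)) %>%                                             ## Drop the NA values
  group_by(val) %>%
  summarise(column = toString(var), freq = n()) %>%
  mutate(ranking = dense_rank(desc(freq))) %>%
  arrange(ranking)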


How to custom flatten a data frame? [duplicate]

This question already has answers here:
Split comma-separated strings in a column into separate rows
I have a data frame as follows:
df <- data.frame(x=c('a,b,c','d,e','f'),y=c(1,2,3))
df
> df
x y
1 a,b,c 1
2 d,e 2
3 f 3
I can get the flattened df$x like this:
unique(unlist(strsplit(as.character(df$x), ",")))
[1] "a" "b" "c" "d" "e" "f"
What would be the best way to transform my input df into:
x y
a 1
b 1
c 1
d 2
e 2
f 3
Basically, I want to flatten df$x and assign each element its corresponding y value.
If you are working with a data.frame, I recommend using tidyr:
df <- data.frame(x=c('a,b,c','d,e','f'),y=c(1,2,3),stringsAsFactors = F)
library(tidyr)
df %>%
  transform(x = strsplit(x, ",")) %>%
  unnest(x)
y x
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
Alternatively, in base R:
sapply(unlist(strsplit(as.character(df$x), ",")), function(ss)
  df$y[which(grepl(pattern = ss, x = df$x))])
#a b c d e f
#1 1 1 2 2 3
If you want a data frame:
do.call(rbind, lapply(1:NROW(df), function(i)
  setNames(data.frame(unlist(strsplit(as.character(df$x[i]), ",")), df$y[i]),
           names(df))))
# x y
#1 a 1
#2 b 1
#3 c 1
#4 d 2
#5 e 2
#6 f 3
FWIW, you could also repeat the row indices according to how many elements each x value has:
df <- data.frame(x=c('a,b,c','d,e','f'), y=c(1,2,3), stringsAsFactors=FALSE)
df[, 1] <- strsplit(df[, 1], ",")
cbind(x = unlist(df[, 1]), df[rep(1:nrow(df), lengths(df[, 1])), -1, drop = FALSE])
# x y
# 1 a 1
# 1.1 b 1
# 1.2 c 1
# 2 d 2
# 2.1 e 2
# 3 f 3
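For completeness (my addition, not one of the original answers): newer tidyr versions also provide separate_rows(), which handles this case directly. A minimal sketch:
library(tidyr)
df <- data.frame(x=c('a,b,c','d,e','f'), y=c(1,2,3), stringsAsFactors=FALSE)
separate_rows(df, x, sep = ",")  # one row per comma-separated element of x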

Get first and last value from groups using rle

I want to get the first and last value for each group, grouping the rows the way the rle() function does.
For example I have this data frame:
> df
df time
1 1 A
2 1 B
3 1 C
4 1 D
5 2 E
6 2 F
7 2 G
8 1 H
9 1 I
10 1 J
11 3 K
12 3 L
13 3 M
14 2 N
15 2 O
16 2 P
I want to get something like this:
> want
df first last
1 1 A D
2 2 E G
3 1 H J
4 3 K M
5 2 N P
As you can see, I want to group my values the way the rle() function does: elements belong to the same group only when the same value appears on consecutive rows. group_by() groups elements in a different way.
> rle(df$df)
Run Length Encoding
lengths: int [1:5] 4 3 3 3 3
values : num [1:5] 1 2 1 3 2
Is there a solution for my problem? Any advice will be appreciated.
There is a function rleid from data.table that does that job, i.e.
library(data.table)
setDT(df)[, .(df = head(df, 1),
              first = head(time, 1),
              last = tail(time, 1)),
          by = .(grp = rleid(df))][, grp := NULL][]
Which gives,
df first last
1: 1 A D
2: 2 E G
3: 1 H J
4: 3 K M
5: 2 N P
Adding a dplyr approach, as @RonakShah mentions:
library(dplyr)
df %>%
  group_by(grp = cumsum(c(0, diff(df)) != 0)) %>%
  summarise(df = first(df),
            first = first(time),
            last = last(time)) %>%
  select(-grp)
Giving,
# A tibble: 5 x 3
df first last
<int> <chr> <chr>
1 1 A D
2 2 E G
3 1 H J
4 3 K M
5 2 N P
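If data.table is installed anyway, rleid() can also be dropped straight into the dplyr pipeline in place of the cumsum() trick (a sketch, my own variant of the answer above):
library(dplyr)
df %>%
  group_by(grp = data.table::rleid(df)) %>%  # run id per block of consecutive values
  summarise(df = first(df),
            first = first(time),
            last = last(time)) %>%
  select(-grp)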
Here is an option using base R with rle. Once we run rle on the first column, we replicate the sequence of run indices according to the run lengths, use duplicated to build logical indexes marking the first and last row of each run, and then subset the original dataset with those indexes.
rl <- rle(df[,1])
i1 <- rep(seq_along(rl$values), rl$lengths)
i2 <- !duplicated(i1)
i3 <- !duplicated(i1, fromLast = TRUE)
wanted <- data.frame(df = df[i2,1], first = df[i2,2], last = df[i3,2])
wanted
# df first last
#1 1 A D
#2 2 E G
#3 1 H J
#4 3 K M
#5 2 N P

Determining if values of previous rows repeat in dataframe

I have some data organized like this:
set.seed(12)
ids <- matrix(replicate(1000,sample(LETTERS[1:4],2)),ncol=2,byrow=T)
df <- data.frame(
event = 1:100,
id1 = ids[,1],
id2 = ids[,2],
grp = rep(1:10, each=100), stringsAsFactors=F)
head(df,10)
event id1 id2 grp
1 1 A C 1
2 2 D A 1
3 3 A D 1
4 4 A B 1
5 5 A D 1
6 6 B C 1
7 7 B D 1
8 8 B D 1
9 9 B D 1
10 10 C A 1
There are pairs of ids (id1 & id2). Within a row they are never the same. There is a variable called grp. There are 10 groups. Each group could be considered a separate sample of data. The event variable goes from 1-100 in each group.
The first question I have is quite straightforward. Within each group, for each row, is the combination of the two ids (id1-id2) the same as the previous row, the reverse of the previous row, or neither of these two options. Obviously, if there is an A-C combination on row 100 of one group, I am not interested in whether it is reversed, the same or whatever on row 1 of the following group.
This is my temporary solution:
#Give each id pair an identifier:
df$pair <- paste(pmin(df$id1, df$id2), pmax(df$id1, df$id2))
#For each grp, work out using dplyr's `lag` if the previous row contains the same pair of ids, and if so whether they are in the same or reversed order:
library(dplyr)
df.sp <- split(df, df$grp)
df$value <- unlist(lapply(df.sp, function(x)
  ifelse(x$pair != lag(x$pair), NA, ifelse(x$id1 == lag(x$id1), 1, 0))))
This gives:
head(df,10)
event id1 id2 grp pair value
1 1 A C 1 A C NA
2 2 D A 1 A D NA
3 3 A D 1 A D 0
4 4 A B 1 A B NA
5 5 A D 1 A D NA
6 6 B C 1 B C NA
7 7 B D 1 B D NA
8 8 B D 1 B D 1
9 9 B D 1 B D 1
10 10 C A 1 A C NA
This works - showing 0 as a reversal, 1 as a copy and NA as neither.
The more complex question I am interested in is the following. Within each group (grp), for each row, find whether its combination of two ids (the pair) previously occurred anywhere earlier in that grp. If it did, return whether the pair was in the same order or the reversed order the most recent previous time it occurred.
That result would look like this:
event id1 id2 grp pair value
1 1 A C 1 A C NA
2 2 D A 1 A D NA
3 3 A D 1 A D 0
4 4 A B 1 A B NA
5 5 A D 1 A D 1
6 6 B C 1 B C NA
7 7 B D 1 B D NA
8 8 B D 1 B D 1
9 9 B D 1 B D 1
10 10 C A 1 A C 0
E.g. row 10 is returned as 0 because the combination A-C previously occurred in the reverse order (row 1). On row 5 a 1 is returned because A-D previously occurred in the same order on row 3.
You're almost there! The second question is equivalent to the first question, just grouping by pair as well as group. I converted the code to dplyr (though I appreciate the spirit behind keeping the question in base). I also removed the second ifelse, replacing it with a numeric conversion of the logical, which should be more performant (and some will find easier to read).
df %>%
  group_by(grp) %>%
  mutate(
    pair = paste(pmin(id1, id2), pmax(id1, id2)),
    prev_row = ifelse(pair != lag(pair), NA, as.numeric(id1 == lag(id1)))
  ) %>%
  group_by(grp, pair) %>%
  mutate(prev_any = ifelse(pair != lag(pair), NA, as.numeric(id1 == lag(id1)))) %>%
  head(10)
# Source: local data frame [10 x 7]
# Groups: grp, pair [5]
#
# event id1 id2 grp pair prev_row prev_any
# (int) (chr) (chr) (int) (chr) (dbl) (dbl)
# 1 1 A C 1 A C NA NA
# 2 2 D A 1 A D NA NA
# 3 3 A D 1 A D 0 0
# 4 4 A B 1 A B NA NA
# 5 5 A D 1 A D NA 1
# 6 6 B C 1 B C NA NA
# 7 7 B D 1 B D NA NA
# 8 8 B D 1 B D 1 1
# 9 9 B D 1 B D 1 1
# 10 10 C A 1 A C NA 0
For such grouping, filtering and mutating tasks, I find dplyr very helpful. Here is one way I came up with to achieve your goal:
library(dplyr)
df %>%
  group_by(grp) %>%
  mutate(value = ifelse(id1 == lag(id1) & id2 == lag(id2), 1,
                 ifelse(id1 == lag(id2) & id2 == lag(id1), 0, NA)))
Within each group, you compare the ID values and conditionally assign a new value column. Hope this helps.
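A usage note (my addition): this covers only the row-immediately-before case; grouping by pair as well, as in the previous answer, extends it to the most recent earlier occurrence of the same pair. A sketch, assuming the pair column from the question's own code has already been created:
df %>%
  group_by(grp, pair) %>%
  mutate(value_any = ifelse(id1 == lag(id1) & id2 == lag(id2), 1,
                     ifelse(id1 == lag(id2) & id2 == lag(id1), 0, NA)))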

How to get rows with min values in one column, grouped by other column, while keeping other columns?

I have the following data:
df <- data.frame(A = c(1,2,3,4,5,6), B=c("P","P","P","Q","Q","Q"), C=c("a","b","c","d","e","f"))
df
## A B C
## 1 1 P a
## 2 2 P b
## 3 3 P c
## 4 4 Q d
## 5 5 Q e
## 6 6 Q f
I want to get the rows with the minimum value of A for each distinct B, while also keeping the corresponding value of C, e.g.
## A B C
## 1 1 P a
## 4 4 Q d
I tried the following, but neither does what I would want:
> aggregate(df[c('A')], by=df[c('B')], FUN=min)
B A
1 P 1
2 Q 4
> aggregate(df[c('A')], by=df[c('B','C')], FUN=min)
B C A
1 P a 1
2 P b 2
3 P c 3
4 Q d 4
5 Q e 5
6 Q f 6
You can try
library(dplyr)
df %>%
  group_by(B) %>%
  filter(A == min(A))
# A B C
#1 1 P a
#2 4 Q d
Or
library(data.table)
setDT(df)[, .SD[A==min(A)], B]
Or using base R
df[with(df, ave(A, B, FUN=min)==A),]
# A B C
#1 1 P a
#4 4 Q d
You can also use the split-apply technique:
# split `df` on the field 'B'
tmp <- split(df, df$B)
# reduce to the row with the minimum value of A
tmp <- lapply(tmp, function(x)
  x[x$A == min(x$A), ])
# bind the rows together
do.call(rbind,tmp)
#> A B C
#> P 1 P a
#> Q 4 Q d
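As an aside (my addition): in dplyr 1.0 and later, slice_min() expresses this directly, and with_ties = FALSE guarantees exactly one row per group even if A has ties:
library(dplyr)
df %>%
  group_by(B) %>%
  slice_min(A, n = 1, with_ties = FALSE) %>%  # row with the smallest A per B
  ungroup()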

Assigning rows of a data.frame to another data.frame in R based on the frequency of each element's occurrence

I have a data.frame df
> df
V1 V2
1 a b
2 a e
3 a f
4 b c
5 b e
6 b f
7 c d
8 c g
9 c h
10 d g
11 d h
12 e f
13 f g
14 g h
I found the frequency of each element's occurrence in column V1 and sorted the Freq column in ascending order:
>dfFreq <- as.data.frame(table(df$V1))
Var1 Freq
1 a 3
2 b 3
3 c 3
4 d 2
5 e 1
6 f 1
7 g 1
>dfFreqSorted <- dfFreq[order(dfFreq$Freq),]
Var1 Freq
5 e 1
6 f 1
7 g 1
4 d 2
1 a 3
2 b 3
3 c 3
Now what I want to do is create a new data.frame based on the original one, such that each "Var1" item in "dfFreqSorted" is used according to its Freq, but only once per pass going from the top of "dfFreqSorted" to the bottom. That would give the result below.
Consider the first Var1 item, "e": the first matching row for "e" in the V1 column of df is (e, f), so it would be the first row of the new data.frame.
I figured that this can be done using:
>subset(df, V1==dfFreqSorted$Var[1])[1,]
V1 V2
12 e f
So if I loop through all the elements in the Var1 column of dfFreqSorted with a for loop, apply the subset command above, and rbind the returned results into another data.frame, I get something like this:
V1 V2
12 e f
13 f g
14 g h
10 d g
1 a b
4 b c
7 c d
This result shows each Var1 item once. I also need the remaining rows, shown below: after the first pass over all the rows of Var1, the loop should go back to the beginning, consider only those Var1 values whose frequency is greater than 1, and fetch the next unused row from df for each of them. The remaining rows to be produced in the same data.frame are:
11 d h
2 a e
5 b e
8 c g
3 a f
6 b f
9 c h
As you can see above, the elements of Var1 with frequency 1 are used first, then those with frequency greater than 1 (i.e. 2) are used once more, and in the next iteration those with frequency greater than 2 (i.e. 3) are used again, each time fetching the corresponding unused row of that element from df.
In short, all the rows of df are arranged in a new data.frame such that elements are taken in ascending order of their frequencies, once per pass, with the number of passes for each element determined by its frequency.
I am not asking for the whole code, just a few guidelines on how I can achieve this. Thanks in advance.
Hello @akrun, I am a beginner, so this is probably a beginner-level approach, but it solved my problem perfectly fine.
> a<-read.table("isnodes.txt")
> a
V1 V2
1 a b
2 a e
3 a f
4 b c
5 b e
6 b f
7 c d
8 c g
9 c h
10 d g
11 d h
12 e f
13 f g
14 g h
> aF<-as.data.frame(table(a$V1))
> aF
Var1 Freq
1 a 3
2 b 3
3 c 3
4 d 2
5 e 1
6 f 1
7 g 1
> aFsorted <- aF[order(aF$Freq),]
> aFsorted
Var1 Freq
5 e 1
6 f 1
7 g 1
4 d 2
1 a 3
2 b 3
3 c 3
> sortedEdgeList <- a[-c(1:nrow(a)),]
> sortedEdgeList
[1] V1 V2
<0 rows> (or 0-length row.names)
> aFsorted <- cbind(aFsorted, Used=0)
> aFsorted
Var1 Freq Used
5 e 1 0
6 f 1 0
7 g 1 0
4 d 2 0
1 a 3 0
2 b 3 0
3 c 3 0
> maxFreq <- max(aFsorted$Freq)
> maxFreq
[1] 3
> for(i in 1:maxFreq){
+ rows<-nrow(aFsorted)
+ for(j in 1:rows){
+ Var1Value<-aFsorted$Var[j]
+ Var1Edge<-a[match(aFsorted$Var1[j],a$V1),]
+ sortedEdgeList<-rbind(sortedEdgeList,Var1Edge)
+ a<-a[!(a$V1==Var1Edge$V1 & a$V2==Var1Edge$V2),]
+ aFsorted$Used[j]=aFsorted$Used[j]+1
+ }
+ if(aFsorted$Used==aFsorted$Freq){
+ aFsorted<-aFsorted[!(aFsorted$Used==aFsorted$Freq),]
+ }
+ }
Warning messages:
1: In if (aFsorted$Used == aFsorted$Freq) { :
the condition has length > 1 and only the first element will be used
2: In if (aFsorted$Used == aFsorted$Freq) { :
the condition has length > 1 and only the first element will be used
3: In if (aFsorted$Used == aFsorted$Freq) { :
the condition has length > 1 and only the first element will be used
> sortedEdgeList
V1 V2
12 e f
13 f g
14 g h
10 d g
5 a b
4 b c
7 c d
11 d h
2 a e
51 b e
8 c g
3 a f
6 b f
9 c h
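For reference (my suggestion, not part of the original answer): the warnings come from giving if() a condition of length greater than one; filtering the exhausted elements with a vectorized subset avoids them while keeping the same logic. A sketch, assuming the same a and aFsorted objects as above, before the loop is run:
sortedEdgeList <- a[0, ]   # empty data.frame with a's columns
aFsorted$Used <- 0
for (i in seq_len(max(aFsorted$Freq))) {
  for (j in seq_len(nrow(aFsorted))) {
    # first remaining row of `a` whose V1 matches this element
    Var1Edge <- a[match(aFsorted$Var1[j], a$V1), ]
    sortedEdgeList <- rbind(sortedEdgeList, Var1Edge)
    # drop the used row from `a` and count the use
    a <- a[!(a$V1 == Var1Edge$V1 & a$V2 == Var1Edge$V2), ]
    aFsorted$Used[j] <- aFsorted$Used[j] + 1
  }
  # vectorised: keep only elements that still have unused rows
  aFsorted <- aFsorted[aFsorted$Used < aFsorted$Freq, ]
}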
I'm not sure this is what you want, but it might be close. It helps conceptually to keep the frequencies in the original data frame.
library("plyr")
set.seed(3)
df <- data.frame(V1 = sample(letters[1:10], 20, replace = TRUE),
                 V2 = sample(letters[1:10], 20, replace = TRUE),
                 stringsAsFactors = FALSE)
df$freqV1 <- NA_integer_
for (i in 1:nrow(df)) {
  df$freqV1[i] <- length(grep(pattern = df$V1[i], x = df$V1))
}
df2 <- arrange(df, freqV1, V2)  # you may want just arrange(df, freqV1)
which gives:
V1 V2 freqV1
1 h c 1
2 d a 2
3 d b 2
4 c c 2
5 c j 2
6 b c 3
7 g c 3
8 b f 3
9 g h 3
10 g h 3
11 b i 3
12 i a 4
13 i c 4
14 i d 4
15 i f 4
16 f b 5
17 f d 5
18 f d 5
19 f e 5
20 f f 5
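A small aside (my addition): the grep() loop counts substring matches, which happens to be fine for single letters but would over-count if V1 held longer strings; table() gives the per-value counts directly:
df$freqV1 <- as.integer(table(df$V1)[df$V1])  # frequency of each row's V1 value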
