I have a dataframe:
ID SUB_ID Action
1 A Open
1 A Download
1 A Close
1 B Open
1 B Search
1 B Download
1 B Close
2 AA Open
2 AA Download
2 AA Close
2 BB Open
2 BB Search
2 BB Filter
2 BB Close
3 C Open
3 C Search
3 C Filter
3 C Close
I want to get a table with the ID, the number of SUB_IDs per ID, and the number of "Download" values in column Action within each ID's SUB_IDs. So, the desired result is:
ID SUB_ID_n Download_n
1 2 2
2 2 1
3 1 0
How could I do that?
Count the number of unique values using n_distinct, and sum the logical vector Action == 'Download' to count the rows where the action is a download.
library(dplyr)
# Per ID: count the distinct SUB_ID values and the rows whose Action is
# "Download". Both aggregates ignore missing values, so a single NA in
# Action does not turn the whole group's count into NA.
df %>%
  group_by(ID) %>%
  summarise(
    SUB_ID_n = n_distinct(SUB_ID, na.rm = TRUE),
    Download_n = sum(Action == "Download", na.rm = TRUE)
  )
# ID SUB_ID_n Download_n
# <int> <int> <int>
#1 1 2 2
#2 2 2 1
#3 3 1 0
In data.table this can be written as :
library(data.table)
# data.table version of the per-ID summary. setDT() converts df by reference.
# Both aggregates drop missing values so an NA in Action cannot propagate
# into Download_n.
setDT(df)[
  ,
  .(
    SUB_ID_n = uniqueN(SUB_ID, na.rm = TRUE),
    Download_n = sum(Action == "Download", na.rm = TRUE)
  ),
  by = ID
]
data
# Example input: 18 action-log rows spread over three IDs and five SUB_IDs.
df <- data.frame(
  ID = rep(1:3, times = c(7L, 7L, 4L)),
  SUB_ID = rep(c("A", "B", "AA", "BB", "C"), times = c(3L, 4L, 3L, 4L, 4L)),
  Action = c(
    "Open", "Download", "Close",
    "Open", "Search", "Download", "Close",
    "Open", "Download", "Close",
    "Open", "Search", "Filter", "Close",
    "Open", "Search", "Filter", "Close"
  ),
  stringsAsFactors = FALSE
)
Related
I would like to join repeatedly between two tables. Here is the table.
structure(list(key = structure(1:4, .Label = c("A", "B", "C", "D"),
class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
structure(list(key = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor"), source = structure(c(1L, 1L, 2L, 2L), .Label = c("a", "b"), class = "factor"), value = c(1L, 1L, 2L, 2L)), class = "data.frame", row.names = c(NA, -4L))
<joined>
key
A
B
C
D
<joining>
key source value
A a 1
B a 1
B b 2
C b 2
If I use the left_join function like left_join(joined, joining, by = "key"), the result is:
key source value
1 A a 1
2 B a 1
3 B b 2
4 C b 2
5 D <NA> NA
However, I want to join grouping by "source". My expected results are here.
# Manual approach: split 'joining' by source, left-join each part onto
# 'joined', show both intermediate results, then stack them.
joining_a <- filter(joining, source == "a")
joining_b <- filter(joining, source == "b")
left_join(joined, joining_a, by = "key")
left_join(joined, joining_b, by = "key")
bind_rows(
  left_join(joined, joining_a, by = "key"),
  left_join(joined, joining_b, by = "key")
)
key source value
1 A a 1
2 B a 1
3 C <NA> NA
4 D <NA> NA
5 A <NA> NA
6 B b 2
7 C b 2
8 D <NA> NA
How do I join the tables without manually dividing them first?
We can group_split(or split from base R) the 'joining' into a list and then do the left_join with 'joined' using map
library(tidyverse)
# Split 'joining' into one data frame per source, left-join each piece onto
# 'joined', and row-bind the per-source results.
joining %>%
  group_split(source) %>%
  map_dfr(function(piece) left_join(joined, piece, by = "key"))
# key source value
#1 A a 1
#2 B a 1
#3 C <NA> NA
#4 D <NA> NA
#5 A <NA> NA
#6 B b 2
#7 C b 2
#8 D <NA> NA
Or without a lambda function
# Same per-source left join, passing left_join directly: each split piece is
# supplied as the unnamed argument (y), with x and by fixed by name.
map_dfr(
  group_split(joining, source),
  left_join,
  x = joined,
  by = "key"
)
data
# Left-hand table: the four keys every result block must contain.
joined <- data.frame(key = factor(c("A", "B", "C", "D")))

# Right-hand table: key/source/value rows; key "D" is absent and key "B"
# appears once per source.
joining <- data.frame(
  key = factor(c("A", "B", "B", "C")),
  source = factor(c("a", "a", "b", "b")),
  value = c(1L, 1L, 2L, 2L)
)
Greetings,
I would need to prepare data for network analysis in Gephi. I have data in the following format:
MY Data
And I need data in format (Where the values represent persons that are connected through the organization):
Required format
Thank you very much!
I think this code should do the job. It is not the best or most elegant way of doing it, but it works :)
# Sample data: one row per (person, organization) membership.
x <- data.frame(
  Persons = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L),
  Organizations = c("A", "B", "E", "F", "A", "E", "C", "D", "C", "A", "E"),
  stringsAsFactors = FALSE
)

# Self-merge on the organization (n:n); columns 2 and 3 of the result are the
# two person columns, Persons.x and Persons.y.
edgelist <- merge(x, x, by = "Organizations")[, 2:3]

# Drop links from a person to themselves.
edgelist <- edgelist[edgelist$Persons.x != edgelist$Persons.y, ]

# Keep only the first occurrence of each (Persons.x, Persons.y) pair.
edgelist <- edgelist[!duplicated(edgelist), ]

edgelist
#> Persons.x Persons.y
#> 2 1 3
#> 3 1 2
#> 4 3 1
#> 6 3 2
#> 7 2 1
#> 8 2 3
HIH
Created on 2018-01-03 by the reprex
package (v0.1.1.9000).
Starting with x:
structure(list(Persons = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), Organizations = c("A", "B", "E", "F", "A", "E", "C", "D", "C", "A", "E")), .Names = c("Persons", "Organizations"), class = "data.frame", row.names = c(NA,-11L))
Create a new data.frame with different names. Just convert Organizations to a factor and then use the numeric values:
> y=data.frame(Source=x$Persons, Target=as.numeric(as.factor(x$Organizations)))
> y
Source Target
1 1 1
2 1 2
3 1 5
4 2 6
5 2 1
6 2 5
7 2 3
8 3 4
9 3 3
10 3 1
11 3 5
For what it's worth, I'm pretty sure gephi can handle strings.
I have five factor columns with 2 levels each, and their column names are like c(a,b,x,y,z). The commands below work for one column at a time, but I need to do it for all five columns at the same time.
# Relabel the levels of factor column "x", in their existing order, as 0 and 1.
levels(car_data[,"x"]) <- c(0,1)
# Convert the factor to numbers: take the level labels as numeric values and
# index them by the factor, so each element is replaced by its level's value.
car_data[,"x"] <- as.numeric(levels(car_data[,"x"]))[car_data[,"x"]]
If there are two levels, then we can do
library(dplyr)
# Recode every two-level factor column to 0/1: as.integer() yields the factor
# codes (1 and 2), and subtracting 1 shifts them to 0 and 1.
# A formula lambda is used instead of funs(), which is deprecated in dplyr.
car_data %>%
  mutate_all(~ as.integer(.) - 1)
# a b c
#1 0 0 0
#2 1 1 1
#3 0 0 0
#4 1 1 1
data
# Example input: three factor columns, each alternating between the two
# levels "a" and "b".
car_data <- data.frame(
  a = factor(c("a", "b", "a", "b")),
  b = factor(c("a", "b", "a", "b")),
  c = factor(c("a", "b", "a", "b"))
)
I am trying to read the following .csv file into R. As you can see from the image below, Row 2 has the unique variable names, while Row 3 has the values for the above variables. So Rows 2/3 together represent one observation. This process continues, so that Row 4 is a line of variable names and Row 5 corresponds to those variable values. This process continues so that each two-row pair (2/3, 4/5, 6/7....999/1000) represents one observation. There are 1,000 observations total in the data set.
What I am having trouble with is reading this into R so that I have a more usable dataset. My goal is to have a standard set of variable names across the top row, and each subsequent line representing one observation.
Any expert R coders have suggestions?
Thank you,
Here is a solution that worked on a simple test case I made. You'd need to import your data into a data.frame, x = read.csv(file="your-file.csv")
To test this though, I used the test data.frame, x:
# Test input: a 4-row data.frame with seven factor columns V1..V7.
# Rows 1 and 3 hold variable names; rows 2 and 4 hold the matching values.
x=structure(list(V1 = structure(c(2L, 1L, 4L, 3L), .Label = c("1",
"a", "ab", "h"), class = "factor"), V2 = structure(c(2L, 1L,
4L, 3L), .Label = c("2", "b", "cd", "i"), class = "factor"),
V3 = structure(c(3L, 1L, 2L, 4L), .Label = c("3", "a", "c",
"ef"), class = "factor"), V4 = structure(c(3L, 1L, 2L, 4L
), .Label = c("4", "b", "d", "gh"), class = "factor"), V5 = structure(c(3L,
1L, 2L, 4L), .Label = c("5", "c", "e", "ij"), class = "factor"),
V6 = structure(c(3L, 1L, 2L, 4L), .Label = c("6", "d", "f",
"kl"), class = "factor"), V7 = structure(c(3L, 1L, 2L, 4L
), .Label = c("7", "e", "g", "mno"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7"), class = "data.frame", row.names = c(NA,
-4L))
Which turns this table (rows 1 and 3 are your labels):
V1 V2 V3 V4 V5 V6 V7
1 a b c d e f g
2 1 2 3 4 5 6 7
3 h i a b c d e
4 ab cd ef gh ij kl mno
Using this script to generate a final data.frame dat:
library(plyr)
# Odd rows of x hold the variable names, even rows hold the matching values.
variables <- x[seq(1, nrow(x), 2), ] # df of all variable rows
values <- x[seq(2, nrow(x), 2), ] # df of all value rows

# Build one single-row data.frame per observation, named from its own
# variable row, then bind them all in a single rbind.fill() call (which
# accepts a list and fills columns missing from a row with NA). This avoids
# re-copying a growing data.frame on every iteration, and seq_len() avoids
# the 1:0 trap when there are no rows.
dat <- rbind.fill(lapply(seq_len(nrow(variables)), function(i) {
  dat.temp <- data.frame(values[i, ])
  colnames(dat.temp) <- as.matrix(variables[i, ])
  # print(dat.temp) # uncomment to check each row while debugging
  dat.temp
}))
Into this final table (variables are the column names now):
a b c d e f g h i
1 1 2 3 4 5 6 7 <NA> <NA>
2 ef gh ij kl mno <NA> <NA> ab cd
Hope it works.
I have a table made up of three columns. A person identifier, a column of event type (A or B) and a column of dates when the event happened.
This is shown here:
Person Event EventDate
1 A 22/03/15
1 A 22/05/15
1 B 12/12/15
1 B 12/12/15
2 B 01/13/12
2 B 02/03/12
2 B 03/08/14
2 A 05/09/14
3 B 02/02/02
3 A 03/05/14
3 B 03/06/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I want to subset the data. The subset should capture all Event A rows within a Person that occur after that Person's first Event B. The output would be:
Person Event EventDate
2 A 05/09/14
3 A 03/05/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I think the problem I have is knowing how to compare rows for a Person based on two column comparison (Event and EventDate).
This is the dput of the original data above
structure(list(Person..Event...EventDate = c("1 A 22/03/15",
"1 A 22/05/15", "1 B 12/12/15", "1 B 12/12/15", "2 B 01/13/12",
"2 B 02/03/12", "2 B 03/08/14", "2 A 05/09/14", "3 B 02/02/02",
"3 A 03/05/14", "3 B 03/06/14", "3 A 17/11/15", "3 A 17/02/16",
"3 A 18/05/16", "3 A 23/06/16")), .Names = "Person..Event...EventDate", class = "data.frame", row.names = c(NA,
-15L))
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)). Grouped by 'Person', we order the 'Person', 'EventDate' (after converting to Date class), then if the cumulative sum of 'B' 'Event' is 1 for the first elements, then get the row index of 'A' 'Event' and use that to subset the original dataset.
library(data.table)
# Inner query: order rows by Person and parsed date, then per Person collect
# the original row indices (.I) of 'A' events that come after at least one 'B'
# event (cumsum(Event == "B") > 0 is TRUE from the first 'B' onwards).
# Outer query: use those indices to subset df1.
# The previous condition only tested whether the person's very first event was
# 'B', so a history such as A, B, A returned nothing instead of the last 'A'.
setDT(df1)[
  df1[
    order(Person, as.Date(EventDate, "%d/%m/%y")),
    .I[cumsum(Event == "B") > 0 & Event == "A"],
    by = Person
  ]$V1
]
# Person Event EventDate
#1: 2 A 05/09/14
#2: 3 A 03/05/14
#3: 3 A 17/11/15
#4: 3 A 17/02/16
#5: 3 A 18/05/16
#6: 3 A 23/06/16
Or we can use dplyr
library(dplyr)
# Sort each person's events by parsed date, then keep the 'A' rows that come
# after at least one 'B' row: cumany() is TRUE from the first 'B' onwards.
# The previous first(Event == "B") test only checked the person's earliest
# event, so a history such as A, B, A returned nothing instead of the last 'A'.
df1 %>%
  arrange(Person, as.Date(EventDate, "%d/%m/%y")) %>%
  group_by(Person) %>%
  filter(cumany(Event == "B") & Event == "A") %>%
  ungroup() # do not leak the grouping into later steps
# Person Event EventDate
# <int> <chr> <chr>
#1 2 A 05/09/14
#2 3 A 03/05/14
#3 3 A 17/11/15
#4 3 A 17/02/16
#5 3 A 18/05/16
#6 3 A 23/06/16
data
# Example input: 15 events for three persons; EventDate is kept as a
# day/month/two-digit-year character string.
df1 <- data.frame(
  Person = rep(1:3, times = c(4L, 4L, 7L)),
  Event = c(
    "A", "A", "B", "B",
    "B", "B", "B", "A",
    "B", "A", "B", "A", "A", "A", "A"
  ),
  EventDate = c(
    "22/03/15", "22/05/15", "12/12/15", "12/12/15",
    "01/13/12", "02/03/12", "03/08/14", "05/09/14",
    "02/02/02", "03/05/14", "03/06/14", "17/11/15",
    "17/02/16", "18/05/16", "23/06/16"
  ),
  stringsAsFactors = FALSE
)
This can be done using sqldf. I'm assuming the data is sorted by date.
> library(sqldf)
>
v1 <- structure(list(Person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), Event = c("A", "A", "B", "B", "B", "B", "B", "A", "B", "A",
"B", "A", "A", "A", "A"), EventDate = c("22/03/15", "22/05/15", "12/12/15",
"12/12/15", "01/10/12", "02/03/12", "03/08/14", "05/09/14", "02/02/02",
"03/05/14", "03/06/14", "17/11/15", "17/02/16", "18/05/16", "23/06/16")), .Names
= c("Person", "Event", "EventDate"), class = "data.frame", row.names = c(NA,
-15L))
> v1$EventDate <- as.Date(v1$EventDate , '%d/%m/%y')
> v2 <- v1[v1$Event == 'B' , ]
> v2 <- v2[ !duplicated(v2$Person) , ]
> v3 <- v1[v1$Event == 'A' , ]
> sqldf("select a.* from v3 a , v2 b where a.EventDate > b.EventDate And a.Person = b.Person")
Person Event EventDate
1 2 A 2014-09-05
2 3 A 2014-05-03
3 3 A 2015-11-17
4 3 A 2016-02-17
5 3 A 2016-05-18
6 3 A 2016-06-23
> v1
Person Event EventDate
1 1 A 2015-03-22
2 1 A 2015-05-22
3 1 B 2015-12-12
4 1 B 2015-12-12
5 2 B 2012-10-01
6 2 B 2012-03-02
7 2 B 2014-08-03
8 2 A 2014-09-05
9 3 B 2002-02-02
10 3 A 2014-05-03
11 3 B 2014-06-03
12 3 A 2015-11-17
13 3 A 2016-02-17
14 3 A 2016-05-18
15 3 A 2016-06-23