Related
I have 4 dummy variables taking values 0 or 1 corresponding to the adoption or not of a certain technology. The data frame has over 14000 rows.
I want to loop over these 4 columns and record, in a new variable, which combination of columns equals 1 in each row.
Data
structure(list(tech1 = structure(c(2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), tech2 = structure(c(2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"), tech3 = structure(c(1L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor"), tech4 = structure(c(1L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor")), row.names = c(NA, 4L), class = "data.frame")
As different combinations are possible, my new variable should contain the information of which technologies each row indicates, that is, of the 4 technologies, which ones were adopted in each case.
Here is what the first four rows of the new variable could look like in the end (where "12" means technologies 1 and 2 were adopted, and so on):
Variable "Tech":
structure(list(Tech = structure(c(1L, 2L, 3L, 4L), .Label = c("12", "2", "234", "2"), class = "factor")),row.names = c(NA, 4L), class = "data.frame")
I have seen some functions that could work (e.g. aggregate), but I haven't found a solution so far.
following on from SteveM:
data.frame(tech=apply(df, 1, function(x) paste(which(x==1), collapse="")))
tech
#1 12
#2 2
#3 234
#4 2
Or a tidyverse method:
# Build one string per row listing the adopted technologies ("12", "234", ...).
# The `value == 1` test is applied inside summarise() rather than in a
# preceding filter(): a row that adopted nothing is then kept with Tech == ""
# instead of being dropped, so the result has exactly one row per input row.
df %>%
  mutate(id = row_number()) %>%
  pivot_longer(tech1:tech4) %>%
  group_by(id) %>%
  summarise(Tech = paste(gsub("tech", "", name[value == 1]), collapse = ""))
# A tibble: 4 x 2
# id Tech
# <int> <chr>
#1 1 12
#2 2 2
#3 3 234
#4 4 2
Without knowing what your desired end state is, with the apply function you can generate a list by row of the 1's in each column and a list by column of the 1's in each row.
# Random 0/1 indicator matrix: 100 values arranged in 4 columns (25 rows).
m <- matrix(sample(0:1, 100, replace = TRUE), ncol = 4)

# For every row: the column positions that hold a 1.
# lapply() over the row indices is used instead of apply(m, 1, ...) because
# apply() simplifies its result to a matrix (or vector) whenever every row
# happens to contain the same number of 1's, so it does not reliably return
# a list.
rows <- lapply(seq_len(nrow(m)), function(i) which(m[i, ] == 1))

# For every column: the row positions that hold a 1 (same reasoning as above).
cols <- lapply(seq_len(ncol(m)), function(j) which(m[, j] == 1))
library(tidyverse)
# Example data: four logical dummies plus a numeric value column.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
# The outer parentheses print the tibble after assigning it.
(df <- tribble(
  ~dum1, ~dum2, ~dum3, ~dum4, ~value,
  FALSE, TRUE,  FALSE, TRUE,  12,
  TRUE,  TRUE,  FALSE, FALSE, 20,
  FALSE, TRUE,  FALSE, TRUE,  32,
  TRUE,  FALSE, TRUE,  FALSE, 27
))

# Replace each dummy by its technology number ("" when not adopted), then
# concatenate the four pieces into which_tech. Note that the dum* columns are
# overwritten with character values; the result is printed, not assigned.
(
  df %>%
    mutate(
      dum1 = ifelse(dum1, "1", ""),
      dum2 = ifelse(dum2, "2", ""),
      dum3 = ifelse(dum3, "3", ""),
      dum4 = ifelse(dum4, "4", ""),
      which_tech = paste0(dum1, dum2, dum3, dum4)
    )
)
Output:
# A tibble: 4 x 6
dum1 dum2 dum3 dum4 value which_tech
<chr> <chr> <chr> <chr> <dbl> <chr>
1 "" "2" "" "4" 12 24
2 "1" "2" "" "" 20 12
3 "" "2" "" "4" 32 24
4 "1" "" "3" "" 27 13
My input file is:
input_file <- structure(list(species = structure(1:3, .Label = c("x", "y",
"z"), class = "factor"), header1 = c(0L, 1L, 0L), header2 = c(0L,
1L, 1L), header3 = c(1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-3L))
Here 1 and 0 indicate presence and absence.
Now, I need to convert this file (based on presence - absence values) to:
output_file <- structure(list(header1 = structure(c(2L, 1L, 1L), .Label = c("",
"y"), class = "factor"), header2 = structure(c(2L, 3L, 1L), .Label = c("",
"y", "z"), class = "factor"), header3 = structure(1:3, .Label = c("x",
"y", "z"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
For this, first I try to melt my input file using reshape2:
library(reshape2)
df2 <- melt(input_file, id.var = "species")
Now I am not sure how to create a dataframe to get my desired output.
Thanks!
since you are using reshape2 you could do:
library(reshape2)
# `df2` is the melted frame created by melt(input_file, id.var = "species"),
# i.e. it has the columns species, variable and value.
# Keep the presence rows only (value > 0), number them within each header via
# ave(..., FUN = seq_along), and spread the species names into one column per
# header. The trailing [-1] drops the helper index column.
dcast(
  subset(df2, value > 0),
  ave(value, variable, FUN = seq_along) ~ variable,
  value.var = "species"
)[-1]
header1 header2 header3
1 y y x
2 <NA> z y
3 <NA> <NA> z
You can then replace the NA with the empty string
In base R, you could do:
# Step 1: wide -> long. `varying = -1` selects every column except the first
# (species); with sep = "" a name such as "header1" is split into the variable
# "header" and the time index 1. Only the presence rows (header > 0) are kept.
df1 <- subset(
  reshape(
    input_file,
    varying = -1,
    sep = "",
    direction = "long",
    idvar = "species"
  ),
  header > 0
)

# Step 2: long -> wide. `header` is overwritten with a running counter within
# each time index, which then serves as the row id; [-1] drops that id column.
reshape(
  transform(df1, header = ave(time, time, FUN = seq_along)),
  direction = "wide",
  idvar = "header",
  sep = ""
)[-1]
species1 species2 species3
y.1 y y x
z.2 <NA> z y
z.3 <NA> <NA> z
Here's a base R solution. It first does an ifelse on each row. If it finds a 1 it replaces it with the species name. If it finds a zero it writes a blank. The species column is then removed. The second line just ensures that any empty cells are moved to the bottom of the columns.
m <- t(apply(input_file, 1, function(x) ifelse(x == "1", x[1], ""))[-1,])
df <- as.data.frame(apply(m, 2, function(x) x[order(-nchar(x))]))
So we can see this matches your output file:
df
#> header1 header2 header3
#> 1 y y x
#> 2 z y
#> 3 z
identical(df, output_file)
#> [1] TRUE
I have the following data:
store location mass target
1 1 (Ams) 45 ?
2 5 (Ber) 500 ?
3 8 (Mar) 1003 ?
In this last column target I would like to have a value from the table:
location
mass range 1 5 8
0 - 350 3 4 5
> 351 6 7 8
So the target column should contain the values, 3, 7, 8 in the first three rows.
I tried to use the INDEX() function, but it did not work out. If anyone knows how to do this in R or in Power BI, that would also help me. Thanks!
In R, the example is reproducible using:
structure(list(Store = 1:3, Location = structure(c(2L, 3L, 1L
), .Label = c("08-Mar", "1 Ams", "5 Ber"), class = "factor"),
Mass = c(1000L, 800L, 500L)), class = "data.frame", row.names = c(NA,
-3L))
and
structure(list(X = structure(1:2, .Label = c("0 - 350", "351 - 1000"
), class = "factor"), X1 = c(3L, 6L), X5 = c(4L, 7L), X8 = c(5L,
8L)), class = "data.frame", row.names = c(NA, -2L))
Reform your table 2 then you could use INDEX and MATCH functions as below
In R, we require a bit of pre-processing before we can actually merge the two tables since the data is not in a standard format. Assuming the two tables are called df1 and df2 respectively, we separate the data into different columns for Location in df1 and X in df2. We also add additional "X" character in df1 so that it matches the column name of df2. We bring data in long format using gather in df2 and use fuzzy_left_join to merge by number range.
library(fuzzyjoin)
library(tidyverse)
df1 %>%
separate(Location, into = c("Loc1", "Loc2"), sep = "\\s+|-", convert = TRUE) %>%
mutate(Loc1 = paste0("X", Loc1)) %>%
fuzzy_left_join(df2 %>%
separate(X, into = c("start", "end"), convert = TRUE) %>%
gather(key, Target, starts_with("X")),
by = c("Loc1" = "key", "Mass" = "start", "Mass" = "end"),
match_fun = list(`==`, `>=`, `<=`))
# Store Loc1 Loc2 Mass start end key Target
#1 1 X1 Ams 1000 351 1000 X1 6
#2 2 X5 Ber 800 351 1000 X5 7
#3 3 X8 Mar 45 0 350 X8 5
data
df1 <- structure(list(Store = 1:3, Location = structure(c(2L, 3L, 1L
), .Label = c("08-Mar", "1 Ams", "5 Ber"), class = "factor"),
Mass = c(1000, 800, 45)), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(X = structure(1:2, .Label = c("0 - 350", "351 - 1000"
), class = "factor"), X1 = c(3L, 6L), X5 = c(4L, 7L), X8 = c(5L,
8L)), class = "data.frame", row.names = c(NA, -2L))
I need to merge two tables in R.
The table X looks this way:
company_name country_code country cost1 cost2
1 Test1 FR <NA> NA 9.945000e-02
2 Test1 BR Brazil NA NA
3 Test2 <NA> USA 1 1.053000e-01
The table Y looks this way:
country country_code tier
France FR 1
Brazil BR 2
USA US 1
I need to merge X and Y to get Z:
name country_code tier
Test1 FR 1
Test2 BR 2
....
How can I merge using an OR condition, i.e. match on either country_code or country?
The following will do it. Note that I use a function from package zoo, so you will need to have it installed.
# Outer join of df1 and df2; all = TRUE keeps unmatched rows from both sides.
# No `by` is given, so merge() joins on the column names the two frames share.
m <- merge(df1, df2, all = TRUE)
# Carry the last non-NA country forward into the NA entries.
# NOTE(review): this depends on the row order produced by merge() — confirm
# that the fill picks the intended country on other data.
m$country <- zoo::na.locf(m$country)
# Per country: fill NAs from the following rows (fromLast = TRUE) ...
m <- lapply(split(m, m$country), function(.m) zoo::na.locf(.m, fromLast = TRUE))
# ... and then from the preceding rows.
m <- lapply(m, function(.m) zoo::na.locf(.m))
# Stack the per-country pieces back into one data frame.
m <- do.call(rbind, m)
# Drop duplicated rows and keep columns 3, 2 and 4, which appear as
# name, country_code and tier in the printed result.
m <- m[!duplicated(m), c(3, 2, 4)]
# Reset the row names to the default sequence.
row.names(m) <- NULL
m
# name country_code tier
#1 First FR 1
#2 Third US 1
#3 Second BR 2
DATA.
df1 <-
structure(list(name = structure(1:3, .Label = c("First", "Second",
"Third"), class = "factor"), country = structure(c(1L, NA, 2L
), .Label = c("France", "USA"), class = "factor"), country_code = structure(c(NA,
1L, 2L), .Label = c("BR", "US"), class = "factor")), .Names = c("name",
"country", "country_code"), class = "data.frame", row.names = c(NA,
-3L))
df2 <-
structure(list(country = structure(c(2L, 1L, 3L), .Label = c("Brazil",
"France", "USA"), class = "factor"), country_code = structure(c(2L,
1L, 3L), .Label = c("BR", "FR", "US"), class = "factor"), tier = c(1L,
2L, 1L)), .Names = c("country", "country_code", "tier"), class = "data.frame", row.names = c(NA,
-3L))
EDIT.
After the comments and the question edit by the OP, the input data has changed and the following code and new df1 reflect that change.
# Fill missing values within the groups of `DF` defined by column `col`.
#   DF  : data frame to fill.
#   col : column selector used as DF[[col]] to split DF into groups.
# Every group is filled backward first (fromLast = TRUE) and then forward
# with zoo::na.locf(); the groups are then row-bound again and the row names
# are reset. Returns the re-assembled data frame.
fun <- function(DF, col){
  groups <- split(DF, DF[[col]])
  filled <- lapply(groups, function(.g) {
    zoo::na.locf(zoo::na.locf(.g, fromLast = TRUE))
  })
  out <- do.call(rbind, filled)
  row.names(out) <- NULL
  out
}
m <- merge(df1, df2, all = TRUE)
m$country <- zoo::na.locf(m$country)
m$country_code <- zoo::na.locf(m$country_code)
m <- fun(m, "country_code")
m <- m[!duplicated(m), ]
m
# country_code country company_name cost1 cost2 tier
#1 BR Brazil Test <NA> 0.0819 2
#2 FR France Test <NA> 0.09945 1
#4 US USA Test <NA> 0.1053 1
df1 <-
structure(list(company_name = structure(c(1L, 1L, 1L), .Label = "Test", class = "factor"),
country_code = structure(c(2L, 1L, NA), .Label = c("BR",
"FR"), class = "factor"), country = structure(c(NA, 1L, 2L
), .Label = c("Brazil", "USA"), class = "factor"), cost1 = c(NA,
NA, NA), cost2 = c(0.09945, 0.0819, 0.1053)), .Names = c("company_name",
"country_code", "country", "cost1", "cost2"), class = "data.frame", row.names = c("1",
"2", "3"))
I would like to compare two data sets and identify specific instances of discrepancies between them (i.e., which variables were different).
While I have found out how to identify which records are not identical between the two data sets (using the function detailed here: http://www.cookbook-r.com/Manipulating_data/Comparing_data_frames/), I'm not sure how to flag which variables are different.
E.g.
Data set A:
id name dob vaccinedate vaccinename dose
100000 John Doe 1/1/2000 5/20/2012 MMR 4
100001 Jane Doe 7/3/2011 3/14/2013 VARICELLA 1
Data set B:
id name dob vaccinedate vaccinename dose
100000 John Doe 1/1/2000 5/20/2012 MMR 3
100001 Jane Doee 7/3/2011 3/24/2013 VARICELLA 1
100002 John Smith 2/5/2010 7/13/2013 HEPB 3
I want to identify which records are different, and which specific variable(s) have discrepancies. For example, the John Doe record has 1 discrepancy in dose, and the Jane Doe record has 2 discrepancies: in name and vaccinedate. Also, data set B has one additional record that was not in data set A, and I would want to identify these instances as well.
In the end, the goal is to find the frequency of the "types" of errors, e.g. how many records have a discrepancy in vaccinedate, vaccinename, dose, etc.
Thanks!
This should get you started, but there may be more elegant solutions.
First, establish df1 and df2 so others can reproduce quickly:
df1 <- structure(list(id = 100000:100001, name = structure(c(2L, 1L), .Label = c("Jane Doe","John Doe"), class = "factor"), dob = structure(1:2, .Label = c("1/1/2000", "7/3/2011"), class = "factor"), vaccinedate = structure(c(2L, 1L), .Label = c("3/14/2013", "5/20/2012"), class = "factor"), vaccinename = structure(1:2, .Label = c("MMR", "VARICELLA"), class = "factor"), dose = c(4L, 1L)), .Names = c("id", "name", "dob", "vaccinedate", "vaccinename", "dose"), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(id = 100000:100002, name = structure(c(2L, 1L, 3L), .Label = c("Jane Doee", "John Doe", "John Smith"), class = "factor"), dob = structure(c(1L, 3L, 2L), .Label = c("1/1/2000", "2/5/2010", "7/3/2011"), class = "factor"), vaccinedate = structure(c(2L, 1L, 3L), .Label = c("3/24/2013", "5/20/2012", "7/13/2013"), class = "factor"), vaccinename = structure(c(2L, 3L, 1L), .Label = c("HEPB", "MMR", "VARICELLA"), class = "factor"), dose = c(3L, 1L, 3L)), .Names = c("id", "name", "dob", "vaccinedate", "vaccinename", "dose"), class = "data.frame", row.names = c(NA, -3L))
Next, get the discrepancies from df1 to df2 via mapply and setdiff. That is, what's in set one that's not in set two:
# Column-by-column set difference: the values present in a column of df1 that
# do not occur in the matching column of df2.
# Map() is mapply() with SIMPLIFY = FALSE, so the result is always a named
# list, even when every column yields the same number of discrepancies
# (mapply() would collapse that case into a vector or matrix).
discrep <- Map(setdiff, df1, df2)
discrep
# $id
# integer(0)
#
# $name
# [1] "Jane Doe"
#
# $dob
# character(0)
#
# $vaccinedate
# [1] "3/14/2013"
#
# $vaccinename
# character(0)
#
# $dose
# [1] 4
To count them up we can use sapply:
num.discrep <- sapply(discrep, length)
num.discrep
# id name dob vaccinedate vaccinename dose
# 0 1 0 1 0 1
Per your question on obtaining id's in set two that are not in set one, you could reverse the process with mapply(setdiff, df2, df1) or if it's simply an exercise of ids only you could do setdiff(df2$id, df1$id).
For more on R's functional functions (e.g., mapply, sapply, lapply, etc.) see this post.
Updating with a purrr solution:
map2(df1, df2, setdiff) %>%
map_int(length)
One possibility. First, find out which ids both datasets have in common. The simplest way to do this is:
commonID<-intersect(A$id,B$id)
Then you can determine which rows are missing from A by:
> B[!B$id %in% commonID,]
# id name dob vaccinedate vaccinename dose
# 3 100002 John Smith 2/5/2010 7/13/2013 HEPB 3
Next, you can restrict both datasets to the ids they have in common.
Acommon<-A[A$id %in% commonID,]
Bcommon<-B[B$id %in% commonID,]
If you can't assume that the id's are in the right order, then sort them both:
Acommon<-Acommon[order(Acommon$id),]
Bcommon<-Bcommon[order(Bcommon$id),]
Now you can see what fields are different like this.
diffs<-Acommon != Bcommon
diffs
# id name dob vaccinedate vaccinename dose
# 1 FALSE FALSE FALSE FALSE FALSE TRUE
# 2 FALSE TRUE FALSE TRUE FALSE FALSE
This is a logical matrix, and you can do whatever you want with it. For example, to find the total number of errors in each column:
colSums(diffs)
# id name dob vaccinedate vaccinename dose
# 0 1 0 1 0 1
To find all ids where the name is different:
Acommon$id[diffs[,"name"]]
# [1] 100001
And so on.
There is a new package called waldo:
install.packages("waldo")
library(waldo)
# construct the data frames
df1 <- structure(list(id = 100000:100001, name = structure(c(2L, 1L), .Label = c("Jane Doe","John Doe"), class = "factor"), dob = structure(1:2, .Label = c("1/1/2000", "7/3/2011"), class = "factor"), vaccinedate = structure(c(2L, 1L), .Label = c("3/14/2013", "5/20/2012"), class = "factor"), vaccinename = structure(1:2, .Label = c("MMR", "VARICELLA"), class = "factor"), dose = c(4L, 1L)), .Names = c("id", "name", "dob", "vaccinedate", "vaccinename", "dose"), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(id = 100000:100002, name = structure(c(2L, 1L, 3L), .Label = c("Jane Doee", "John Doe", "John Smith"), class = "factor"), dob = structure(c(1L, 3L, 2L), .Label = c("1/1/2000", "2/5/2010", "7/3/2011"), class = "factor"), vaccinedate = structure(c(2L, 1L, 3L), .Label = c("3/24/2013", "5/20/2012", "7/13/2013"), class = "factor"), vaccinename = structure(c(2L, 3L, 1L), .Label = c("HEPB", "MMR", "VARICELLA"), class = "factor"), dose = c(3L, 1L, 3L)), .Names = c("id", "name", "dob", "vaccinedate", "vaccinename", "dose"), class = "data.frame", row.names = c(NA, -3L))
# compare them
compare(df1,df2)
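And we get a report of the following form (the sample below comes from a different pair of data frames, with columns X, Y and Z, not from the df1 and df2 constructed above):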
`old` is length 2
`new` is length 3
`names(old)`: "X" "Y"
`names(new)`: "X" "Y" "Z"
`attr(old, 'row.names')`: 1 2 3
`attr(new, 'row.names')`: 1 2 3 4
`old$X`: 1 2 3
`new$X`: 1 2 3 4
`old$Y`: "a" "b" "c"
`new$Y`: "A" "b" "c" "d"
`old$Z` is absent
`new$Z` is a character vector ('k', 'l', 'm', 'n')
library(compareDF)
compare_df(dataframe1, dataframe2, c("columnname"))