I have a dataframe and two lists; each list contains some of the dataframe's column names. I need to add new columns containing the rank of each of those columns. The issue is that I have to rank the columns matching Deslist2 in descending order and the columns whose names match Asclist1 in ascending order, and produce the final output accordingly.
I tried any() with %in%, e.g. tr <- any(Asclist1 %in% DF1), but it doesn't work.
Again, my problem isn't how to add new columns containing ranks; my problem is how to rank based on the lists' elements, because, as you can see, Asclist1 contains elements that don't exist among the DF1 columns.
DF1 <- data.frame("name" = c("john", "adam", "leo", "lena", "Di"),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23),
"grade" = c(96, 46, 63, 54, 23),
"income" = c(59, 36, 93, 34, 23),
"score" = c(99, 46, 23, 54, 23))
print(DF1)
Asclist1<-list("score","income","spending")
Asclist1
Deslist2<-list("age","grade")
Deslist2
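For reference, here is a minimal sketch of the idea (my own assumption, not an accepted answer), assuming dplyr >= 1.0: first restrict each list to the column names that actually exist in DF1, then rank those columns ascending or descending with across().
library(dplyr)
# keep only the list elements that are real columns of DF1 ("spending" is dropped)
asc_cols <- intersect(unlist(Asclist1), names(DF1))
des_cols <- intersect(unlist(Deslist2), names(DF1))
DF1 %>%
  mutate(across(all_of(asc_cols), rank, .names = "{.col}_rank")) %>%       # ascending
  mutate(across(all_of(des_cols), ~ rank(-.x), .names = "{.col}_rank"))    # descending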
Update
Code 1
library(readr)
library(tidyr)
library(purrr)
library(rlang)
library(glue)
library(dplyr)
library(miscTools)
library(matrixStats)
library(shiny)
library(reshape2)
hotdrinks<-list("tea","green tea")
juices<-list("orange","mango")
energydrinks<-list("powerhorse","redbull")
drinks<-list("hotdrinks"=hotdrinks,"juices"=juices,"energydrinks"=energydrinks)
biscuits<-list("loacker","tuc")
choc<-list("aftereight","lindt")
gum<-list("trident","clortes")
sweets<-list("gum"=gum,"biscuits"=biscuits,"choc"=choc)
all_products<-list("sweets"=sweets,"drinks"=drinks)
mt<-melt(all_products)
mt2<-mt%>%mutate("Price"=c(23,34,23,23,54,32,45,23,12,56,76,43),
"Quantity"=c(10,20,26,22,51,52,45,23,12,56,76,43),
"amount"=c(23,34,23,23,54,32,45,23,12,56,76,43))
t1<-mt2[,c(4,3,1,5,6,7)]
t1
colnames(t1)<-c("CAT","PN","SP","Quantity","Price","amount")
t2<-list(unique(t1$CAT))
t2
QL<-c("Quantity","Price")
QD<-c("Quantity","amount")
QS<-c("amount","Price")
all <- list("drinks"=drinks, "sweets"=sweets)
FCX<-data.frame("sbo"=c("w","q","a"),
"Quantity_fcx"=c(3,2,5),
"Price_fcx"=c(7,8,5),
"amount_fcx"=c(4,7,3)
)
#DF1<-Y
DF1 <- t1
DF1
#print(DF1)
DFCXL<-list(colnames(DF1[-c(1:3)]))
DFCXL
DFCX1<-lapply(DFCXL, paste0, "_fcx")
DFCX1
DFCXM<-colMeans(FCX[,unlist(DFCX1)],na.rm = FALSE)
DFCXM
DFCXMd<-colMedians(data.matrix(FCX[,unlist(DFCX1)]),na.rm = FALSE )
DFCXMddf<-as.data.frame(t(DFCXMd))
DFCXMddf
DFCX1l<-as.list(DFCX1)
colnames(DFCXMddf)<-unlist(DFCX1l)
DFCXMddf
#median repeated tibble
rDFCXMddf<-DFCXMddf[rep(seq_len(nrow(DFCXMddf)), each = nrow(DF1)), ]
rDFCXMddf
DFCX<-data.frame(t(DFCXM))
DFL<-as.vector(colnames(DF1))
DFL
DFCX
#mean repeated tibble
rDFCX<-DFCX[rep(seq_len(nrow(DFCX)), each = nrow(DF1)), ]
#rDFCX
#ascending rank from smallest to largest where the smallest is the most competitive
Asclist1<-list("Quantity","Price")
#Asclist1
#descending rank from largest to smallest where the largest is the most competitive
Deslist2<-list("xyz","amount")
#Deslist2
#DF3 contains the original dataframe with a descending/ascending rank for each matched column
DF3<-
DF1 %>% mutate_if(grepl(paste(Deslist2, collapse = "|"), names(.)), list(rank=~rank(-.))) %>%
mutate_if(grepl(paste(Asclist1, collapse = "|"), names(.)), list(rank=~rank( .)))
DF3
#DF4 contains only the new rank columns
DF4<-DF3%>%select(-one_of(DFL))
DF4
#DF5 contains the repeated weight (fcx) columns together with the rank columns
DF5<-cbind(rDFCX,DF4)
DF5
#getting the final rank for each column by multiplying the CX "weight" column by the plain rank to get a weighted ranking
dynamic_mutate = function(DF5,
                          col_names = gsub("(.*)_\\w+$", "\\1", names(DF5)),
                          expression = "({x}_rank*{x}_fcx)",
                          prefix = "FINAL"){
  name_list = col_names %>% unique() %>% as.list()
  # build one expression per column stem, e.g. FINAL_Quantity = (Quantity_rank*Quantity_fcx);
  # parse_expr() replaces the deprecated rlang::parse_quosure()
  expr_list = name_list %>% lapply(function(x) parse_expr(glue(expression))) %>%
    setNames(paste(prefix, name_list, sep = "_"))
  DF5 %>% mutate(!!!expr_list)}
DF6<-DF5 %>% dynamic_mutate()
#DF6
#keep only the FINAL_ weighted-rank columns
DFL2<-as.vector(colnames(DF5))
DF7<-DF6%>%select(-one_of(DFL2))
#DF7
#final score (row mean of the weighted ranks)
DF8<-mutate(DF7,fnl_scr=rowMeans(DF7))
#DF8
#final rank
Ranking<-rank(DF8$fnl_scr)
#Ranking
#final dataframe
DF9<-as_tibble(cbind(DF1,Ranking))
DF9
Code 2 is identical to Code 1 except for the two lists:
#ascending rank from smallest to largest where the smallest is the most competitive
Asclist1<-list("Quantity","Price","amount")
#descending rank from largest to smallest where the largest is the most competitive
Deslist2<-list("xyz")
Another option is to use map to do both rankings simultaneously by creating a column of 1/-1 multipliers:
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
tibble(col1 = list(Asclist1, Deslist2), col2 = c(1, -1)) %>%
  unnest_longer(col1) %>%
  group_split(col2) %>%
  map_dfc(~ DF1 %>%
            mutate(tmp = first(.x$col2)) %>%
            select(one_of(.x$col1), tmp) %>%
            transmute_at(vars(-tmp), list(rank = ~rank(tmp * .)))) %>%
  bind_cols(DF1, .)
# name sex age grade income score age_rank grade_rank income_rank score_rank
#1 john m 99 96 59 99 1.0 1 4 5.0
#2 adam m 46 46 36 46 3.0 4 3 3.0
#3 leo m 23 63 93 23 4.5 2 5 1.5
#4 lena f 54 54 34 54 2.0 3 2 4.0
#5 Di f 23 23 23 23 4.5 5 1 1.5
#Warning message:
#Unknown columns: `spending`
It also reports the unknown columns (here `spending`) in a warning.
Update
If only a single column is passed to transmute_at, it does not add the name in the list as a suffix. To work around that, we can create a function that uses rename_if:
f1 <- function(dat) {
  nm1 <- setdiff(names(dat), "tmp")
  n1 <- length(nm1)
  dat %>%
    transmute_at(vars(-tmp), list(rank = ~rank(tmp * .))) %>%
    rename_if(rep(n1 == 1, n1), ~ str_c(nm1, "_", .))
}
tibble(col1 = list(Asclist1, Deslist2), col2 = c(1, -1)) %>%
  unnest_longer(col1) %>%
  group_split(col2) %>%
  map_dfc(~ DF1 %>%
            mutate(tmp = first(.x$col2)) %>%
            select(one_of(.x$col1), tmp) %>%
            f1(.)) %>%
  bind_cols(DF1, .)
# CAT PN SP Quantity Price amount amount_rank Quantity_rank Price_rank
# 1 sweets gum trident 23 10 23 9.5 3.5 1
# 2 sweets gum clortes 34 20 34 6.0 7.0 3
# 3 sweets biscuits loacker 23 26 23 9.5 3.5 6
# 4 sweets biscuits tuc 23 22 23 9.5 3.5 4
# 5 sweets choc aftereight 54 51 54 3.0 10.0 9
# 6 sweets choc lindt 32 52 32 7.0 6.0 10
# 7 drinks hotdrinks tea 45 45 45 4.0 9.0 8
# 8 drinks hotdrinks green tea 23 23 23 9.5 3.5 5
# 9 drinks juices orange 12 12 12 12.0 1.0 2
# 10 drinks juices mango 56 56 56 2.0 11.0 11
# 11 drinks energydrinks powerhorse 76 76 76 1.0 12.0 12
# 12 drinks energydrinks redbull 43 43 43 5.0 8.0 7
We can apply rank using mutate_if and grepl
library(dplyr)
DF1 %>%
mutate_if(grepl(paste(Asclist1, collapse = "|"), names(.)), list(rank=~rank( .))) %>%
mutate_if(grepl(paste(Deslist2, collapse = "|"), names(.)), list(rank=~rank(-.)))
name sex age grade income score age_rank grade_rank income_rank score_rank
1 john m 99 96 59 99 1.0 1 4 5.0
2 adam m 46 46 36 46 3.0 4 3 3.0
3 leo m 23 63 93 23 4.5 2 5 1.5
4 lena f 54 54 34 54 2.0 3 2 4.0
5 Di f 23 23 23 23 4.5 5 1 1.5
I need to reorganize my dataframe so that I can run Krippendorff's alpha. What function or simple solution could I use?
Here's what my dataframe looks like: each participant has 7 rows (for 7 observations), and each observation was assessed by two different people. I'd like my dataframe to have three columns: Code, Transcriber1, Transcriber2. Under "Transcriber1" would appear the error scores of the first transcriber, whatever their name is, and under "Transcriber2", the scores for the second.
Any thoughts? Any help will be very much appreciated!
Thanks community!
1) dplyr/tidyr Assuming the input DF is as in the Note at the end, create a Transcriber column with values Transcriber1 and Transcriber2 and a Seq column with sequence numbers, and finally use spread to convert to wide form.
library(dplyr)
library(tidyr)
DF %>%
  group_by(Code) %>%
  mutate(Transcriber = as.numeric(factor(Transcriber, levels = unique(Transcriber)))) %>%
  group_by(Transcriber = paste0("Transcriber", Transcriber), add = TRUE) %>%
  mutate(Seq = seq_along(Errors)) %>%
  ungroup %>%
  spread(Transcriber, Errors) %>%
  select(-Seq)
giving:
# A tibble: 14 x 3
Code Transcriber1 Transcriber2
<dbl> <int> <int>
1 1011 1 8
2 1011 2 9
3 1011 3 10
4 1011 4 11
5 1011 5 12
6 1011 6 13
7 1011 7 14
8 2011 15 22
9 2011 16 23
10 2011 17 24
11 2011 18 25
12 2011 19 26
13 2011 20 27
14 2011 21 28
2) Base R A solution using only base R would be:
make_factor <- function(x) factor(x, levels = unique(x))
DF2 <- transform(DF,
Transcriber = paste0("Transcriber", ave(as.numeric(Transcriber), Code, FUN = make_factor)),
Seq = ave(Errors, Code, Transcriber, FUN = seq_along))
r <- reshape(DF2, dir = "wide", idvar = c("Seq", "Code"), timevar = "Transcriber")[-2]
names(r) <- sub("Errors.", "", names(r))
Note
The input in reproducible form is assumed to be:
DF <- data.frame(Code = rep(c(1011, 2011), each = 14),
Transcriber = rep(c("Anna", "David", "Susan", "Anna"), each = 7),
Errors = 1:28)
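Once the data are in this wide form, the Krippendorff's alpha step itself could look roughly like the following sketch, assuming the irr package and the base-R result r from above; kripp.alpha() expects a raters x units matrix, and "interval" is only an assumed measurement level for the error scores.
library(irr)
# two raters as rows, the 14 observations as columns
ratings <- t(as.matrix(r[, c("Transcriber1", "Transcriber2")]))
kripp.alpha(ratings, method = "interval")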
I have a question that I find hard to explain with an MRE and hard to frame in an easily answerable way, mostly because I don't fully understand where the problem lies myself. So consider this my apology in advance for being vague.
I have a tibble with many sample and reference measurements, for which I want to do some linear interpolation for each sample. I currently do this by taking out all the reference measurements, rescaling them to the sample measurements using approx, and then patching them back in. But because I take them out first, I cannot do it nicely in a group_by dplyr pipe. Right now I use a really ugly workaround where I add empty (NA) columns to the sample tibble and then fill them with a for-loop.
So my question is really: how can I implement the approx part within groups
into the pipe, so that I can do everything within groups? I've experimented
with dplyr::do(), and ran into the vignette on "programming with dplyr", but
searching mostly gives me broom::augment and lm stuff that I think operates
differently... (e.g. see
Using approx() with groups in dplyr). This thread also seems promising: How do you use approx() inside of mutate_at()?
Somebody on irc recommended using a conditional mutate, with case_when, but I
don't fully understand where and how within this context yet.
I think the problem lies in the fact that I want to filter out part of the data
for the following mutate operations, but the mutate operations rely on the
grouped data that I just filtered out, if that makes any sense.
Here's a MWE:
library(tidyverse) # or just dplyr, tibble
# create fake data
data <- data.frame(
# in reality a dttm with the measurement time
timestamp = c(rep("a", 7), rep("b", 7), rep("c", 7)),
# measurement cycle, normally 40 for sample, 41 for reference
cycle = rep(c(rep(1:3, 2), 4), 3),
# whether the measurement is a reference or a sample
isref = rep(c(rep(FALSE, 3), rep(TRUE, 4)), 3),
# measurement intensity for mass 44
r44 = c(28:26, 30:26, 36, 33, 31, 38, 34, 33, 31, 18, 16, 15, 19, 18, 17)) %>%
# measurement intensity for mass 45, normally also masses up to mass 49
mutate(r45 = r44 + rnorm(21, 20))
# of course this could be tidied up to "intensity" with a new column "mass"
# (44, 45, ...), but that would make making comparisons even harder...
# overview plot
data %>%
ggplot(aes(x = cycle, y = r44, colour = isref)) +
geom_line() +
geom_line(aes(y = r45), linetype = 2) +
geom_point() +
geom_point(aes(y = r45), shape = 1) +
facet_grid(~ timestamp)
# what I would like to do
data %>%
group_by(timestamp) %>%
do(target_cycle = approx(x = data %>% filter(isref) %>% pull(r44),
y = data %>% filter(isref) %>% pull(cycle),
xout = data %>% filter(!isref) %>% pull(r44))$y) %>%
unnest()
# immediately append this new column to the original dataframe for all the
# samples (!isref) and then apply another approx for those values.
# here's my current attempt for one of the timestamps
matchref <- function(dat) {
# split the data into sample gas and reference gas
ref <- filter(dat, isref)
smp <- filter(dat, !isref)
# calculate the "target cycle", the points at which the reference intensity
# 44 matches the sample intensity 44 with linear interpolation
target_cycle <- approx(x = ref$r44,
y = ref$cycle, xout = smp$r44)
# append the target cycle to the sample gas
smp <- smp %>%
group_by(timestamp) %>%
mutate(target = target_cycle$y)
# linearly interpolate each reference gas to the target cycle
ref <- ref %>%
group_by(timestamp) %>%
# this is needed because the reference has one more cycle
mutate(target = c(target_cycle$y, NA)) %>%
# filter out all the failed ones (no interpolation possible)
filter(!is.na(target)) %>%
# calculate interpolated value based on r44 interpolation (i.e., don't
# actually interpolate this value but shift it based on the 44
# interpolation)
mutate(r44 = approx(x = cycle, y = r44, xout = target)$y,
r45 = approx(x = cycle, y = r45, xout = target)$y) %>%
select(timestamp, target, r44:r45)
# add new reference gas intensities to the correct sample gasses by the target cycle
left_join(smp, ref, by = c("timestamp", "target"))
}
matchref(data)
# and because now "target" must be length 3 (the group size) or one, not 9
# I have to create this ugly for-loop
# for which I create a copy of data that has the new columns to be created
mr <- data %>%
# filter the sample gasses (since we convert ref to sample)
filter(!isref) %>%
# add empty new columns
mutate(target = NA, r44 = NA, r45 = NA)
# apply matchref for each group timestamp
for (grp in unique(data$timestamp)) {
mr[mr$timestamp == grp, ] <- matchref(data %>% filter(timestamp == grp))
}
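For what it's worth, a minimal sketch of computing the first approx step directly inside a grouped mutate (my own guess at a "within groups" formulation, not the approach taken in the answer that follows) could be:
data %>%
  group_by(timestamp) %>%
  # interpolate the reference cycle at each sample's r44; reference rows get NA
  mutate(target_cycle = if_else(!isref,
                                approx(x = r44[isref], y = cycle[isref], xout = r44)$y,
                                NA_real_)) %>%
  ungroup()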
Here's one approach that spreads the references and samples to new columns. I drop r45 for simplicity in this example.
data %>%
select(-r45) %>%
mutate(isref = ifelse(isref, "REF", "SAMP")) %>%
spread(isref, r44) %>%
group_by(timestamp) %>%
mutate(target_cycle = approx(x = REF, y = cycle, xout = SAMP)$y) %>%
ungroup
gives,
# timestamp cycle REF SAMP target_cycle
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 a 1 30 28 3
# 2 a 2 29 27 4
# 3 a 3 28 26 NA
# 4 a 4 27 NA NA
# 5 b 1 31 26 NA
# 6 b 2 38 36 2.5
# 7 b 3 34 33 4
# 8 b 4 33 NA NA
# 9 c 1 15 31 NA
# 10 c 2 19 18 3
# 11 c 3 18 16 2.5
# 12 c 4 17 NA NA
Edit to address comment below
To retain r45 you can use a gather-unite-spread approach like this:
data %>%
mutate(isref = ifelse(isref, "REF", "SAMP")) %>%
gather(r, value, r44:r45) %>%
unite(ru, r, isref, sep = "_") %>%
spread(ru, value) %>%
group_by(timestamp) %>%
mutate(target_cycle_r44 = approx(x = r44_REF, y = cycle, xout = r44_SAMP)$y) %>%
ungroup
giving,
# # A tibble: 12 x 7
# timestamp cycle r44_REF r44_SAMP r45_REF r45_SAMP target_cycle_r44
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 a 1 30 28 49.5 47.2 3
# 2 a 2 29 27 48.8 48.7 4
# 3 a 3 28 26 47.2 46.8 NA
# 4 a 4 27 NA 47.9 NA NA
# 5 b 1 31 26 51.4 45.7 NA
# 6 b 2 38 36 57.5 55.9 2.5
# 7 b 3 34 33 54.3 52.4 4
# 8 b 4 33 NA 52.0 NA NA
# 9 c 1 15 31 36.0 51.7 NA
# 10 c 2 19 18 39.1 37.9 3
# 11 c 3 18 16 39.2 35.3 2.5
# 12 c 4 17 NA 39.0 NA NA
I would like to return a dataframe with the minimum value of column one based on the values of columns 2-4:
df <- data.frame(one = rnorm(1000),
two = sample(letters, 1000, replace = T),
three = sample(letters, 1000, replace = T),
four = sample(letters, 1000, replace = T))
I can do:
df_group <- df %>%
  group_by(two) %>%
  filter(one == min(one))
This gets me the lowest value of all the "m's" in column two, but what if column three or four had a lower "m" value in column one?
The output should look like this:
one two
1 -0.311609752 r
2 0.053166742 n
3 1.546485810 a
4 -0.430308725 d
5 -0.145428664 c
6 0.419181639 u
7 0.008881661 i
8 1.223517580 t
9 0.797273157 b
10 0.790565358 v
11 -0.560031797 e
12 -1.546234090 q
13 -1.847945540 l
14 -1.489130228 z
15 -1.203255034 g
16 0.146969892 m
17 -0.552363433 f
18 -0.006234646 w
19 0.982932856 s
20 0.751936728 o
21 0.220751258 h
22 -1.557436228 y
23 -2.034885868 k
24 -0.463354387 j
25 -0.351448850 p
26 1.331365941 x
I don't care which column has the lowest value for a given letter, I just need the lowest value and the letter column.
I'm trying to wrap my head around writing this in a simple way. This might be a duplicate, but I didn't know how to word the title and couldn't find any material or previous questions on how to do it.
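For comparison, a minimal base-R sketch (my own addition, not one of the posted answers): stack the three letter columns next to repeated copies of one, then take the group minima with aggregate.
long <- data.frame(one   = rep(df$one, 3),
                   letts = c(as.character(df$two),
                             as.character(df$three),
                             as.character(df$four)))
aggregate(one ~ letts, data = long, FUN = min)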
Another solution, based on data.table:
library(data.table)
setDT(df)
# melt all columns except "one" into long form, then take min(one) for each letter (the melted "value" column)
melt(df,
     measure = grep("one", names(df), invert = TRUE, value = TRUE))[
  , min(one), value]
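The same data.table call can also name its output columns explicitly, which may read more clearly (a stylistic variant, assuming the same setDT(df) setup):
melt(df,
     measure.vars  = grep("one", names(df), invert = TRUE, value = TRUE),
     variable.name = "column",
     value.name    = "letts")[, .(one = min(one)), by = letts]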
You can do something like this:
library(dplyr); library(tidyr)
df %>% gather(cols, letts, -one) %>% # gather all letters into one column
group_by(letts) %>%
summarise(one = min(one)) # do a group by summary for each letter
# A tibble: 26 × 2
# letts one
# <chr> <dbl>
#1 a -2.092327
#2 b -2.461102
#3 c -3.055858
#4 d -2.092327
#5 e -2.461102
#6 f -2.249439
#7 g -1.941632
#8 h -2.543310
#9 i -3.055858
#10 j -1.896974
# ... with 16 more rows