For loop for updating data.frame - r

I am trying to write a "for loop" to update my R data frame by iterating.
Here is my code:
datalist = list()
for (i in 1:5) {
dat <- data.frame(ID=LETTERS[seq( from = 1, to = 20 )],nutrition=rnorm(20, mean=50, sd=10),
Stage=c(rep("A1",5), rep("B1",15)))
dat$ADG<-dat$nutrition*0.05
dat$M_weight<-dat$nutrition*0.5+dat$ADG*100
dat$Age<-dat$M_weight*1.1+dat$ADG*0.6
dat$Stage<-as.character(dat$Stage)
dat$Stage[dat$ADG>=3]<-"C1"
dat$i <- i # maybe you want to keep track of which iteration produced it?
datalist[[i]] <- dat # add it to your list #
}
big_data = do.call(rbind, datalist)
From iteration 2, I would like to have "Stage" updated to "C1" if ADG is equal or greater than 3 but this would not apply to iteration 1.
Thank you so much! I appreciate any replies!

I think you want a recursive function instead of an iterative one
Your data stringsAsFactors=F
dat <- data.frame(ID=LETTERS[seq( from = 1, to = 20 )], nutrition=rnorm(20, mean=50, sd=10), Stage=c(rep("A1",5), rep("B1",15)), stringsAsFactors=F)
Use tidyverse for dplyr and purrr verbs
library(tidyverse)
special <- function( dat, counter, end ) {
dat1 <- dat %>%
mutate(ADG = nutrition*0.05) %>%
mutate(M_weight = nutrition*0.5 + ADG*100) %>%
mutate(Age = M_weight*1.1 + ADG*0.6) %>%
mutate(Stage = ifelse( ADG >= 3, "C1", Stage )) %>%
mutate(i=counter)
if (counter < end) {
special(dat1, counter+1, end)
} else {
return(dat1)
}
}
desired <- map_df(2:5, ~special(dat,1,.x))
head(desired)
ID nutrition Stage ADG M_weight Age i
1 A 47.17826 A1 2.358913 259.4804 286.8438 2
2 B 64.55988 C1 3.227994 355.0794 392.5241 2
3 C 52.29020 A1 2.614510 287.5961 317.9244 2
4 D 59.96544 A1 2.998272 329.8099 364.5899 2
Let me know if this is not the output you were expecting

Related

(R) Use dplyr::filter when the condition is a string

I'm having an issue with dplyr::filter
I have a list of dataframes, such as :
df<- data.frame(a =1:3, b = 3:1, level= letters[1:3])
df2 <- data.frame(a =1:6, b = 21:26, level= letters[1:3])
listofdf <- list(df,df2)
I'm trying to create a function to select specific rows in my dataframe, this means that the argument I put in as the column name is necessarily a string.
Function looks something like this, it is meant to be used on a list of dataframes :
selectOTUlevel <- function(OTU,DATA,column){
for (i in 1:length(DATA)){
DATA <- DATA %>% filter(column == OTU)
}
return(DATA)
}
I've tried creating the same function another way :
selectOTU <- function(OTU,DATA,column){
for (i in 1:length(DATA)){
DATA[[i]] <- DATA[[i]][DATA[[i]]$column == OTU,]
}
return(DATA)
}
But I can't seem to solve this issue. I've tried this solution (https://www.r-bloggers.com/2020/09/using-dplyrfilter-when-the-condition-is-a-string/) but it doesn't work either.
Maybe if someone could enlighten me about what I'm doing wrong, I would be delighted !
Take a look at this example: You can use get() in your function to reference it through a string
library(dplyr)
df <- data.frame(a = 1:3, b = 3:1, level= letters[1:3])
filter_data <- function(data, column_name, filter_value){
data %>% filter(get(column_name) == filter_value)
}
filter_data(data = df,
column_name = "level",
filter_value = "b")
Result:
a b level
1 2 2 b
Your argument doesn't necessarily need to be a string to select a column name using dplyr functions. rlang::enquo(), then !! when you use the variable, allows you to pass it as an unquoted column name the same way you would do it directly to filter:
(modified function slightly)
library(tidyverse)
df<- data.frame(a =1:3, b = 3:1, level= letters[1:3])
df2 <- data.frame(a =1:6, b = 21:26, level= letters[1:3])
listofdf <- list(df,df2)
selectOTUlevel <- function(OTU,DATA,column){
column <- rlang::enquo(column)
D_OUT <- list()
for (i in 1:length(DATA)){
D_OUT[[i]] <- DATA[[i]] %>% filter(!!column == OTU)
}
return(D_OUT)
}
selectOTUlevel("a", listofdf, level)
#> [[1]]
#> a b level
#> 1 1 3 a
#>
#> [[2]]
#> a b level
#> 1 1 21 a
#> 2 4 24 a
Created on 2022-03-25 by the reprex package (v2.0.1)

Creating group ids by comparing values of two variables across rows: in R

I have a dataframe with two variables (start,end). would like to create an identifier variable which grows in ascending order of start and, most importantly, is kept constant if the value of start coincides with end of any other row in the dataframe.
Below is a simple example of the data
toy_data <- data.frame(start = c(1,5,6,10,16),
end = c(10,9,11,15,17))
The output I would be looking for is the following:
output_data <- data.frame(start = c(1,10,5,6,16),
end = c(10,15,9,11,17),
NEW_VAR = c(1,1,2,3,4))
You could try adapting this answer to group by ranges that are adjacent to each other. Credit goes entirely to #r2evans.
In this case, you would use expand.grid to get combinations of start and end. Instead of labels you would have row numbers rn to reference.
In the end, you can number the groups based on which rows appear together in the list. The last few lines starting with enframe use tibble/tidyverse. To match the group numbers I resorted the results too.
I hope this might be helpful.
library(tidyverse)
toy_data <- data.frame(start = c(1,5,6,10,16),
end = c(10,9,11,15,17))
toy_data$rn = 1:nrow(toy_data)
eg <- expand.grid(a = seq_len(nrow(toy_data)), b = seq_len(nrow(toy_data)))
eg <- eg[eg$a < eg$b,]
together <- cbind(
setNames(toy_data[eg$a,], paste0(names(toy_data), "1")),
setNames(toy_data[eg$b,], paste0(names(toy_data), "2"))
)
together <- subset(together, end1 == start2)
groups <- split(together$rn2, together$rn1)
for (i in toy_data$rn) {
ind <- (i == names(groups)) | sapply(groups, `%in%`, x = i)
vals <- groups[ind]
groups <- c(
setNames(list(unique(c(i, names(vals), unlist(vals)))), i),
groups[!ind]
)
}
min_row <- as.numeric(sapply(groups, min))
ctr <- seq_along(groups)
lapply(ctr[order(match(min_row, ctr))], \(x) toy_data[toy_data$rn %in% groups[[x]], ]) %>%
enframe() %>%
unnest(col = value) %>%
select(-rn)
Output
name start end
<int> <dbl> <dbl>
1 1 1 10
2 1 10 15
3 2 5 9
4 3 6 11
5 4 16 17
The following function should give you the desired identifier variable NEW_VAR.
identifier <- \(df) {
x <- array(0L, dim = nrow(df))
count <- 0L
my_seq <- seq_len(nrow(df))
for (i in my_seq) {
if(!df[i,]$start %in% df$end) {
x[i] <- my_seq[i] + count
} else {
x[i] <- my_seq[i]-1L + count
count <- count - 1L
}
}
x
}
Examples
# your example
toy_data <- data.frame(start = c(1,10,5,6,16),
end = c(10,15,9,11,17))
toy_data$NEW_VAR <- identifier(toy_data)
# ---------------------
> toy_data$NEW_VAR
[1] 1 1 2 3 4
# other example
toy_data <- data.frame(start = c(1, 2, 2, 4, 16, 21, 18, 3),
end = c(16, 2, 21, 2, 2, 2, 3, 1))
toy_data$NEW_VAR <- identifier(toy_data)
# ---------------------
> toy_data$NEW_VAR
[1] 0 0 0 1 1 1 2 2

Subset data.frame based on lag between two columns

Suppose you want to subset a data.frame where the rule for keeping rows is based
on a lag beteen rows 'a' and 'b':
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
#output
a b
1 1 0
2 0 1
3 0 1
4 1 0
5 0 1
6 0 1
Essentially, if 'a' = 1 you want to keep that row as well as the subsequent run of rows in
'b' that have a value of 1. This capture continues until the next row with a = 0 & b = 0.
I've tried using nested 'ifelse()' statements, but I am stuck incorporate logical tests based on a lag issue.
Suggestions?
This is how I would do it. There are probably options out there that require maybe 1 or 2 lines less.
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
library(dplyr)
df %>%
mutate(grp = cumsum(a==1|a+b==0)) %>%
group_by(grp) %>%
filter(any(a == 1)) %>%
ungroup() %>%
select(a, b)
A solution without dplyr. Work with a flag:
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# create new empty df
new_df <- read.table(text = "", col.names = c("a", "b"))
a_okay = FALSE # initialize the flag
for (row_number in seq(1:nrow(df))) { # loop over each row of the original df
# if a is 1, we add the row to the new df and set the flag to TRUE
if (df[row_number, "a"] == 1) {
a_okay = TRUE
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# now we consider the rows where a is not 1
else {
# if b is 1 and we are still following an a == 1: add the row
if (df[row_number, "b"] == 1 & a_okay) {
new_df[nrow(new_df) + 1, ] = c(df[row_number, "a"], df[row_number, "b"])
}
# if b is 0, we reset the flag
else {
a_okay = FALSE
}
}
}
Another base solution inspired by this post, #Wietse de Vries's answer and #Ben's comment.
# input
df <- data.frame(a = c(1,0,0,0,1,0,0,0,0,0,0,0),
b = c(0,1,1,0,0,1,1,0,0,0,1,1))
# identify groups
df$grp <- cumsum(df$a == 1 | df$b == 0)
# subset df by groups with first element of a == 1
df <- do.call(rbind, split(df, df$grp)[by(df, df$grp, function(x) {x$a[1] == 1})])
# remove grp
df$grp <- NULL

Setting colnames of several data frames based on a list variable

I have a list of multiple data frames which are built the same way. I would like to change the name of the 1 column of each data frame to the name of the data frame itself and append some text. From several different answers I figured lapply and working on lists would be the best way to go.
Example data:
df1 <- data.frame(A = 1, B = 2, C = 3)
df2 <- data.frame(A = 1, B = 2, C = 3)
dfList <- list(df1,df2)
col1 <- names(dfList)
df<-lapply(dfList, function(x) {
names(x)[1:2] <- c(col1[1:length(col1)]"appended text","Col2","Col3");x
})
The problem seems to be with calling the correct entry in the "col1" variable for each data frame within my code.
Any ideas on how I should address/ express this correctly? Thanks a lot!
df1<-data.frame(A = 1, B = 2, C = 3)
df2<-data.frame(A = 1, B = 2, C = 3)
dfList <- list(df1=df1,df2=df2)
names(dfList)
col1 <- names(dfList)
for(i in 1:length(dfList))
names(dfList[[names(dfList[i])]])[1]<-names(dfList)[i]
dfList
Here is one option with tidyverse
library(tidyverse)
map(dfList, ~ .x %>%
rename(Aappended_text = A))
If this is based on the column index, create a function
fName <- function(lst, new_name, index){
map(lst, ~
.x %>%
rename_at(index, funs(paste0(., new_name))))
}
fName(dfList, "appended_text", 1)
I'm not sure if I'm understanding your quesiton completely but is tihs what you're after:
df1 <- data.frame(A = 1, B = 2, C = 3)
df2 <- data.frame(A = 1, B = 2, C = 3)
dfList <- list(df1,df2)
df <- lapply(dfList, function(x) {
colnames(x) <- c(paste0(colnames(x)[1], "appended text"), colnames(x)[2:length(colnames(x))])
return(x)
})
Output:
> df
[[1]]
Aappended text B C
1 1 2 3
[[2]]
Aappended text B C
1 1 2 3
You can simply use lapply
lapply(dfList, function(x) {
names(x)[1L] <- "some text"
x
})
But if you want to rename by the name of the data frame elements in your list, first you need to name them e.g. dfList <- list(df1 = df1, df2 = df2) and you can't acces them directly with lapply(x, ... so you need to lapplyover your list by indexes, for example :
lapply(seq_along(dfList), function(i) {
names(dfList[[i]])[1L] <- names(dfList[i])
dfList[[i]]
})

Can't sort column with R

I want to create from the dataset a list that contains word and frequency of the word . I did it and saved into val named 'mylist'. now I want to sort the list according to the frequency of the word and to create barplot from the 10 words that have the higher frequency.
but I not succeeded to sort it. I tried many ways to change the type of 'mylist' to data.frame or date.table but still the column of the frequency stay a list.
To sumup I have the DT var that contains it is a list with 2 columns x-contains the words and type is character .
The 2 column is 'v' - that contains the frequency and it is a list.
I am not succeeding to sort it by the frequency.
please help me.
library(ggplot2)
libary(MASS)
#get the data
data.uri = "http://www.crowdflower.com/wp-content/uploads/2016/03/gender-classifier-DFE-791531.csv"
pwd = getwd()
data.file.name = "gender.csv"
data.file = paste0(pwd, "./", data.file.name)
download.file(data.uri, data.file)
data = read.csv(data.file.name)
#manipulate the data
data <- data[data$X_unit_id < 815719694,]
print(data$X_unit_id)
#get all female has white sidebar
female_colors <- subset(data, data$gender=="female")
female_colors$fav_number
#get all male fav_numbers
male_colors <- subset(data, data$gender=="male")
male_colors$fav_number
text_male = subset(data, data$gender=="male")
text_male = text_male$text
print(text_male[1])
print(length(text_male))
v <- text_male[1:length(text_male)]
print(v)
print (v[1])
count_of_list = 0;
x = list()
for ( i in v) {
# Merge the two lists.
x <- c(x,unlist(strsplit(i," ")))
}
count = 0;
mylist = list()
for (word in x){
for (xWord in x){
if (word == xWord)
count = count + 1;
}
key <- word
value <- count
mylist[[ key ]] <- value
count = 0;
}
libary(data.table)
require(data.table)
DT = data.table(x=c(names(mylist)),v=c(mylist))
DT
As suggested in comments, a reproducible example would be useful in creating an answer to help you. I will suggest a proposal anyway. Try to adapt this peocedure to your data.
Convert your list to a dataframe and use order:
df <- as.data.frame(your.data)
df <- data.frame(id = c("B", "A", "D", "C"), y = c(6, 8, 1, 5))
df
id y
1 B 6
2 A 8
3 D 1
4 C 5
df2 <- df[order(df$id), ]
df2
id y
2 A 8
1 B 6
4 C 5
3 D 1
It looks like you're using a cumbersome way to calculate the word counts, something like this is faster and simpler -
library(dplyr)
foo <- c("ant", "ant", "bat", "dog","egg","ant","bat")
bar <- rnorm(7, 5, 2)
df <- data.frame(foo, bar)
group_by(df, foo) %>% summarise(n = n()) %>% arrange(desc(n))
foo n
(fctr) (int)
1 ant 3
2 bat 2
3 dog 1
4 egg 1

Resources