Column/Line in R - r

In R, i have a table where the column name is a date, how do I invert the columns by rows to be able to record in the database?
Example Table:
estab codigo descricao 2021-02-01 2021-02-02
1 103 4390160 ANM 2003 0 0
2 103 4390161 ANM 2004 MF 0 0
3 103 4390162 ANM 2008 MF 0 0
4 103 4390193 ANM 3004 ST 0 0
5 103 4390189 ANM 3008 ST 0 0
6 103 4543512 ANM 24 NET 0 0
7 103 4390163 AMT 2008 RF 0 0
8 103 4543520 ANM 2003 COM BATERIA 0 0
9 103 4543521 ANM 2004 MF COM BATERIA 0 0
10 103 4543522 ANM 2008 MF COM BATERIA 0 0
11 103 4543523 ANM 3004 ST COM BATERIA 0 0
12 103 4543524 ANM 3008 ST COM BATERIA 0 0
13 103 4543516 AMT 8000 0 0
14 103 4390165 AMT 2018 0 0
15 103 4390164 AMT 2010 0 0
I tried to use melt, but it didn't work very well:
xxx <- reshape2::melt(xxx[[1]], id.vars = 'codigo')

If I understood your question, here is a code that should work for you:
# Tried recreating your dataframe
dt <- data.frame(estab = 103,
codigo = 4390160:4390174,
descricao = c("ANM 2003", "ANM 2004", "ANM BATERIA"),
"2021-02-01" = 0,
"2021-02-02" = 0)
dt <- reshape2::melt(dt, id.vars = c("estab", "codigo", "descricao"), variable.name = "Date", value.name = "Value")
# Make column into date
dt$Date <- gsub("X", "", dt$Date)
dt$Date <- as.Date(dt$Date, format = "%Y.%m.%d")
head(dt)

In base R, we can use reshape
out <- reshape(df1, direction = "long", varying = 4:5, sep = "")
row.names(out) <- NULL
data
df1 <- structure(list(estab = c(103, 103, 103, 103, 103, 103, 103, 103,
103, 103, 103, 103, 103, 103, 103), codigo = 4390160:4390174,
descricao = c("ANM 2003", "ANM 2004", "ANM BATERIA", "ANM 2003",
"ANM 2004", "ANM BATERIA", "ANM 2003", "ANM 2004", "ANM BATERIA",
"ANM 2003", "ANM 2004", "ANM BATERIA", "ANM 2003", "ANM 2004",
"ANM BATERIA"), X2021.02.01 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), X2021.02.02 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-15L))

I think you might be after something like the following.
library(tidyr)
library(lubridate)
pivot_longer(df, matches('\\d{4}-\\d{2}-\\d{2}'),
names_to = 'date',
names_transform = list(date = ymd))
# # A tibble: 6 x 5
# estab codigo descricao date value
# <dbl> <dbl> <chr> <date> <dbl>
# 1 103 4390160 ANM 2003 2021-02-01 0
# 2 103 4390160 ANM 2003 2021-02-02 0
# 3 103 4390161 ANM 2004 MF 2021-02-01 0
# 4 103 4390161 ANM 2004 MF 2021-02-02 0
# 5 103 4390162 ANM 2008 MF 2021-02-01 0
# 6 103 4390162 ANM 2008 MF 2021-02-02 0
Data
df <- structure(list(estab = c(103, 103, 103), codigo = c(4390160,
4390161, 4390162), descricao = c("ANM 2003", "ANM 2004 MF", "ANM 2008 MF"
), `2021-02-01` = c(0, 0, 0), `2021-02-02` = c(0, 0, 0)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))

Related

Manipulating cells within a group based on value in another cell

I have a data.frame in the following format:
CowId Bacillus Week
1234 1 Week1
1234 0 Week2
1234 0 Week3
1234 0 Week4
If a cow is bacillus-positive (yes=1, no=0) on Week1, then I want to change remaining values within this column to 1, as so:
CowId Bacillus Week
1234 1 Week1
1234 1 Week2
1234 1 Week3
1234 1 Week4
I tried the following, but unsure how to proceed after determining the infection status of Week1 cows:
dt %>%
group_by(CowId) %>%
mutate(Bacillus = ifelse(Week == "Week1" & Bacillus, 1,
ifelse(Week != "Week1" do something)
Appreciate any comments/feedback.
Try this approach using any() and testing for the week. I have created dummy data to show the example:
library(dplyr)
library(tidyr)
#Code
df %>% group_by(CowId) %>%
mutate(Bacillus=ifelse(any(Bacillus[Week=='Week1']==1),1,0))
Output:
# A tibble: 8 x 3
# Groups: CowId [2]
CowId Bacillus Week
<dbl> <dbl> <chr>
1 1234 1 Week1
2 1234 1 Week2
3 1234 1 Week3
4 1234 1 Week4
5 1235 0 Week1
6 1235 0 Week2
7 1235 0 Week3
8 1235 0 Week4
Some data used:
#Data
df <- structure(list(CowId = c(1234, 1234, 1234, 1234, 1235, 1235,
1235, 1235), Bacillus = c(1, 0, 0, 0, 0, 0, 0, 0), Week = c("Week1",
"Week2", "Week3", "Week4", "Week1", "Week2", "Week3", "Week4"
)), row.names = c(NA, -8L), class = "data.frame")
In base R, we can create a logical vector with 'Bacillus' & the 'Week' where its value is 'Week1', subset the 'CowId', check whether it is in 'CowId', coerce the logical to binary (+)
df$Bacillus <- with(df, +(CowId %in% unique(CowId[as.logical(Bacillus) &
Week == 'Week1'])))
df$Bacillus
#[1] 1 1 1 1 0 0 0 0
data
df <- structure(list(CowId = c(1234, 1234, 1234, 1234, 1235, 1235,
1235, 1235), Bacillus = c(1, 0, 0, 0, 0, 0, 0, 0), Week = c("Week1",
"Week2", "Week3", "Week4", "Week1", "Week2", "Week3", "Week4"
)), row.names = c(NA, -8L), class = "data.frame")

How to find min and max in dplyr?

I know the sum of points for each person.
I need to know: what is the minimum number of points that a person could have. And what is the maximum number of points that a person could have.
What I have tried:
min_and_max <- dataset %>%
group_by(person) %>%
dplyr::filter(min(sum(points, na.rm = T))) %>%
distinct(person) %>%
pull()
min_and_max
My dataset:
id person points
201 rt99 NA
201 rt99 3
201 rt99 2
202 kt 4
202 kt NA
202 kt NA
203 rr 4
203 rr NA
203 rr NA
204 jk 2
204 jk 2
204 jk NA
322 knm3 5
322 knm3 NA
322 knm3 3
343 kll2 2
343 kll2 1
343 kll2 5
344 kll NA
344 kll 7
344 kll 1
I would suggest this dplyr approach. You have to summarize data like this:
library(tidyverse)
#Code
df %>% group_by(id,person) %>%
summarise(Total=sum(points,na.rm = T),
min=min(points,na.rm = T),
max=max(points,na.rm=T))
Output:
# A tibble: 7 x 5
# Groups: id [7]
id person Total min max
<int> <chr> <int> <int> <int>
1 201 rt99 5 2 3
2 202 kt 4 4 4
3 203 rr 4 4 4
4 204 jk 4 2 2
5 322 knm3 8 3 5
6 343 kll2 8 1 5
7 344 kll 8 1 7
Here is the data.table solution -
dataset[, min_points := min(points, na.rm = T), by = person]
dataset[, max_points := max(points, na.rm = T), by = person]
Since I don't have your data, I cannot test this code, but it should work fine.
The summarize() verb is what you want for this. You don't even need to filter out the NA values first since both min() and max() can have na.rm = TRUE.
library(dplyr)
min_and_max <- dataset %>%
group_by(person) %>%
summarize(min = min(points, na.rm = TRUE),
max = max(points, na.rm = TRUE))
min_and_max
# A tibble: 7 x 3
person min max
<chr> <dbl> <dbl>
1 jk 2 2
2 kll 1 7
3 kll2 1 5
4 knm3 3 5
5 kt 4 4
6 rr 4 4
7 rt99 2 3
dput(dataset)
structure(list(id = c(201, 201, 201, 202, 202, 202, 203, 203,
203, 204, 204, 204, 322, 322, 322, 343, 343, 343, 344, 344, 344
), person = c("rt99", "rt99", "rt99", "kt", "kt", "kt", "rr",
"rr", "rr", "jk", "jk", "jk", "knm3", "knm3", "knm3", "kll2",
"kll2", "kll2", "kll", "kll", "kll"), points = c(NA, 3, 2, 4,
NA, NA, 4, NA, NA, 2, 2, NA, 5, NA, 3, 2, 1, 5, NA, 7, 1)), class = "data.frame", row.names = c(NA,
-21L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), person = structure(list(), class = c("collector_character",
"collector")), points = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Splitting one column into two columns using data wrangling with R

I would really appreciate your help in using R for data wrangling. I have a data where I want to split one column (variable) into two whenever applicable as conditioned by other variables. For example, as per the sample below, the data represents reactions time measures (RT1 and RT2) of some words (item) that appear in different times of reading (block). I want to see if RT1 and RT2 values in block 3, 4, and 5 are correlated with RT1 and RT2 values of the same item at block 1. The target items that appeared in block 1 and re-appeared in subsequent blocks are coded as 'EI' in the column 'condition', whereas items coded as 'E' or 'I' appeared only once.
dput(d1)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253
), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I")), row.names = c(NA, -8L
), class = c("tbl_df", "tbl", "data.frame"))
Where a sample of the data would look like this:
> d1
# A tibble: 8 x 6
RECORDING_SESSION_LABEL RT1 RT2 item block condition
<dbl> <dbl> <dbl> <chr> <dbl> <chr>
1 26 5171 357 foreign 1 EI
2 26 3857 328 detailed 1 E
3 26 3447 122 large 1 EI
4 26 314 39 foreign 3 EI
5 26 460 86 foreign 4 EI
6 26 731 132 large 3 EI
7 26 957 173 large 4 EI
8 26 1253 215 disputable 3 I
In order to present in a format that R would understand, the target data frame I want to achieve would be similar to the one below (where the highlighted columns should be added). Rows in blanks at these columns represent items which do not appear repetitively (condition is not coded as 'EI') ; therefore, they are irrelevant and should be coded as 'NA'.
dput(d2)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), `RT 1` = c(5171, 3857, 3447, 314, 460, 731, 957,
1253), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I"), `RT 1_at_block1` = c(NA,
NA, NA, 5171, 5171, 3447, 3447, NA), RT2_at_block1 = c(NA, NA,
NA, 357, 357, 122, 122, NA)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
And a sample of the data format targeted would look like this:
> d2
# A tibble: 8 x 8
RECORDING_SESSI~ `RT 1` RT2 item block condition `RT 1_at_block1`
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 26 5171 357 fore~ 1 EI NA
2 26 3857 328 deta~ 1 E NA
3 26 3447 122 large 1 EI NA
4 26 314 39 fore~ 3 EI 5171
5 26 460 86 fore~ 4 EI 5171
6 26 731 132 large 3 EI 3447
7 26 957 173 large 4 EI 3447
8 26 1253 215 disp~ 3 I NA
# ... with 1 more variable: RT2_at_block1 <dbl>
> head(d2)
# A tibble: 6 x 8
RECORDING_SESSION_LABEL `RT 1` RT2 item block condition `RT 1_at_block1` RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI NA NA
2 26 3857 328 detailed 1 E NA NA
3 26 3447 122 large 1 EI NA NA
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Thanks in advance for any help.
A possible solution using dplyr:
d1 <- structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26, 26, 26, 26),
RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253),
RT2 = c(357, 328, 122, 39, 86, 132, 173, 215),
item = c("foreign", "detailed", "large", "foreign", "foreign", "large", "large", "disputable"),
block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI", "E", "EI", "EI", "EI", "EI", "EI", "I")),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"))
library(dplyr)
d2 <- d1 %>%
left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT1_at_block1 = RT1)) %>%
left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT2_at_block1 = RT2))
After that, d2 looks like this:
RECORDING_SESSION_LABEL RT1 RT2 item block condition RT1_at_block1 RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI 5171 357
2 26 3857 328 detailed 1 E 3857 328
3 26 3447 122 large 1 EI 3447 122
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Edit: Adding a mutate if you want to set the values for block 1 to NA:
d2 <- d1 %>%
left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT1_at_block1 = RT1)) %>%
left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT2_at_block1 = RT2)) %>%
mutate(RT1_at_block1 = ifelse(block == 1, NA, RT1_at_block1),
RT2_at_block1 = ifelse(block == 1, NA, RT2_at_block1))

Conditional Replacing with NA in R (two dataframes)

I have
idx <- c(1397, 2000, 3409, 3415, 4077, 4445, 5021, 5155)
idy <- c( 1397, 2000, 2860, 3029, 3415, 3707, 4077, 4445, 5021, 5155,
5251, 5560)
agex <- c(NA, NA, NA, 35, NA, 62, 35, 46)
agey <- c( 3, 45, 0, 89, 7, 2, 13, 24, 58, 8, 3, 45)
dat1 <- as.data.frame(cbind(idx, agex))
dat2 <- as.data.frame(cbind(idy, agey))
Now I want whenever agex = NA, and idx = idy, that agey = NA, so that
idy agey
1 1397 NA
2 2000 NA
3 2860 0
4 3029 89
5 3415 7
6 3707 2
7 4077 NA
8 4445 24
9 5021 58
10 5155 8
11 5251 3
12 5560 45
I have tried this
ifelse(is.na(dat1$agex) | dat1$idx %in% dat2$idy, NA, dat2$agey)
it returns NAs at the correct indices, but shortens idy to the length of idx.
I want whenever agex = NA, and idx = idy, that agey = NA
With a data.table update join...
library(data.table)
setDT(dat1); setDT(dat2)
dat2[dat1[is.na(agex)], on=.(idy = idx), agey := NA]
dat2
idy agey
1: 1397 NA
2: 2000 NA
3: 2860 0
4: 3029 89
5: 3415 7
6: 3707 2
7: 4077 NA
8: 4445 24
9: 5021 58
10: 5155 8
11: 5251 3
12: 5560 45
How it works
dat1[is.na(agex)] is the subset where agex is NA
DT[mDT, on=, j] is a join where rows of mDT are looked up in DT using on=
j is done in the joined subset of DT
when j is k := expr, column k of DT is updated

Row by row application in R [duplicate]

I have my data in the form of a data.table given below
structure(list(atp = c(1, 0, 1, 0, 0, 1), len = c(2, NA, 3, NA,
NA, 1), inv = c(593, 823, 668, 640, 593, 745), GU = c(36, 94,
57, 105, 48, 67), RUTL = c(100, NA, 173, NA, NA, 7)), .Names = c("atp",
"len", "inv", "GU", "RUTL"), row.names = c(NA, -6L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000320788>)
I need to form 4 new columns csi_begin,csi_end, IRQ and csi_order. the value of csi_begin and csi_end when atp=1 depends directly on inv and gu values.
But when atp is not equal to 1 csi_begin and csi_end depends on inv and gu values and IRQ value of previous row
The value of IRQ depends on csi_order of that row if atp==1 else its 0 and csi_order value depends on two rows previous csi_begin value.
I have written the condition with the help of for loop.
Below is the code given
lostsales<-function(transit)
{
if (transit$atp==1)
{
transit$csi_begin[i]<-(transit$inv)[i]
transit$csi_end[i]<-transit$csi_begin[i]-transit$GU[i]
}
else
{
transit$csi_begin[i]<-(transit$inv)[i]+transit$IRQ[i-1]
transit$csi_end[i]<-transit$csi_begin[i]-transit$GU[i]
}
if (transit$csi_begin[i-2]!= NA)
{
transit$csi_order[i]<-transit$csi_begin[i-2]
}
else
{ transit$csi_order[i]<-0}
if (transit$atp==1)
{
transit$IRQ[i]<-transit$csi_order[i]-transit$RUTL[i]
}
else
{
transit$IRQ[i]<-0
}
}
Can anyone help me how to do efficient looping with data.tables using setkeys? As my data set is very large and I cannot use for loop else the timing would be very high.
Adding the desired outcome to your example would be very helpful, as I'm having trouble following the if/then logic. But I took a stab at it anyway:
library(data.table)
# Example data:
dt <- structure(list(atp = c(1, 0, 1, 0, 0, 1), len = c(2, NA, 3, NA, NA, 1), inv = c(593, 823, 668, 640, 593, 745), GU = c(36, 94, 57, 105, 48, 67), RUTL = c(100, NA, 173, NA, NA, 7)), .Names = c("atp", "len", "inv", "GU", "RUTL"), row.names = c(NA, -6L), class = c("data.table", "data.frame"), .internal.selfref = "<pointer: 0x0000000000320788>")
# Add a row number:
dt[,rn:=.I]
# Use this function to get the value from a previous (shiftLen is negative) or future (shiftLen is positive) row:
rowShift <- function(x, shiftLen = 1L) {
r <- (1L + shiftLen):(length(x) + shiftLen)
r[r<1] <- NA
return(x[r])
}
# My attempt to follow the seemingly circular if/then rules:
lostsales2 <- function(transit) {
# If atp==1, set csi_begin to inv and csi_end to csi_begin - GU:
transit[atp==1, `:=`(csi_begin=inv, csi_end=inv-GU)]
# Set csi_order to the value of csi_begin from two rows prior:
transit[, csi_order:=rowShift(csi_begin,-2)]
# Set csi_order to 0 if csi_begin from two rows prior was NA
transit[is.na(csi_order), csi_order:=0]
# Initialize IRQ to 0
transit[, IRQ:=0]
# If ATP==1, set IRQ to csi_order - RUTL
transit[atp==1, IRQ:=csi_order-RUTL]
# If ATP!=1, set csi_begin to inv + IRQ value from previous row, and csi_end to csi_begin - GU
transit[atp!=1, `:=`(csi_begin=inv+rowShift(IRQ,-1), csi_end=inv+rowShift(IRQ,-1)-GU)]
return(transit)
}
lostsales2(dt)
## atp len inv GU RUTL rn csi_begin csi_end csi_order IRQ
## 1: 1 2 593 36 100 1 593 557 0 -100
## 2: 0 NA 823 94 NA 2 NA NA 0 0
## 3: 1 3 668 57 173 3 668 611 593 420
## 4: 0 NA 640 105 NA 4 640 535 0 0
## 5: 0 NA 593 48 NA 5 593 545 668 0
## 6: 1 1 745 67 7 6 745 678 640 633
Is this output close to what you were expecting?

Resources