How to keep sequence of numbers until certain row number reached - r

I have been trying to assign sequence numbers to rows. I would additionally like the sequence to repeat until a certain row number is reached, for example repeating the sequence within every block of 44 rows.
Here is what I mean
test_table <- data.frame(col=rep(0:10,each=11), row=c(rev(0:10)))
and assigning cumulative numbers in this way
> library(dplyr)
test_table%>%
mutate(No=(row_number() - 1) %/% 11)
test_table
col row No
1 0 10 0
2 0 9 0
3 0 8 0
4 0 7 0
5 0 6 0
6 0 5 0
7 0 4 0
8 0 3 0
9 0 2 0
10 0 1 0
11 0 0 0
12 1 10 1
13 1 9 1
14 1 8 1
15 1 7 1
16 1 6 1
17 1 5 1
18 1 4 1
19 1 3 1
20 1 2 1
21 1 1 1
22 1 0 1
23 2 10 2
24 2 9 2
25 2 8 2
26 2 7 2
27 2 6 2
28 2 5 2
29 2 4 2
30 2 3 2
31 2 2 2
32 2 1 2
33 2 0 2
34 3 10 3
35 3 9 3
36 3 8 3
37 3 7 3
38 3 6 3
39 3 5 3
40 3 4 3
41 3 3 3
42 3 2 3
43 3 1 3
44 3 0 3
45 4 10 4
46 4 9 4
47 4 8 4
48 4 7 4
49 4 6 4
50 4 5 4
51 4 4 4
52 4 3 4
53 4 2 4
54 4 1 4
55 4 0 4
56 5 10 5
57 5 9 5
58 5 8 5
59 5 7 5
60 5 6 5
61 5 5 5
62 5 4 5
63 5 3 5
64 5 2 5
65 5 1 5
66 5 0 5
67 6 10 6
68 6 9 6
69 6 8 6
70 6 7 6
71 6 6 6
72 6 5 6
73 6 4 6
74 6 3 6
75 6 2 6
76 6 1 6
77 6 0 6
78 7 10 7
79 7 9 7
80 7 8 7
81 7 7 7
82 7 6 7
83 7 5 7
84 7 4 7
85 7 3 7
86 7 2 7
87 7 1 7
88 7 0 7
89 8 10 8
90 8 9 8
91 8 8 8
92 8 7 8
93 8 6 8
94 8 5 8
95 8 4 8
96 8 3 8
97 8 2 8
98 8 1 8
99 8 0 8
100 9 10 9
101 9 9 9
102 9 8 9
103 9 7 9
104 9 6 9
105 9 5 9
106 9 4 9
107 9 3 9
108 9 2 9
109 9 1 9
110 9 0 9
111 10 10 10
112 10 9 10
113 10 8 10
114 10 7 10
115 10 6 10
116 10 5 10
117 10 4 10
118 10 3 10
119 10 2 10
120 10 1 10
121 10 0 10
OK, good! But I would like to keep alternating the sequence, for example 0 and 1, until the 44th row is reached. After that, a new sequence should start from 2 and continue until the 88th row, and so on, like this.
So the expected output will be
test_table
col row No
1 0 10 0
2 0 9 0
3 0 8 0
4 0 7 0
5 0 6 0
6 0 5 0
7 0 4 0
8 0 3 0
9 0 2 0
10 0 1 0
11 0 0 0
12 1 10 1
13 1 9 1
14 1 8 1
15 1 7 1
16 1 6 1
17 1 5 1
18 1 4 1
19 1 3 1
20 1 2 1
21 1 1 1
22 1 0 1
23 2 10 0
24 2 9 0
25 2 8 0
26 2 7 0
27 2 6 0
28 2 5 0
29 2 4 0
30 2 3 0
31 2 2 0
32 2 1 0
33 2 0 0
34 3 10 1
35 3 9 1
36 3 8 1
37 3 7 1
38 3 6 1
39 3 5 1
40 3 4 1
41 3 3 1
42 3 2 1
43 3 1 1
44 3 0 1
45 4 10 2
46 4 9 2
47 4 8 2
48 4 7 2
49 4 6 2
50 4 5 2
51 4 4 2
52 4 3 2
53 4 2 2
54 4 1 2
55 4 0 2
56 5 10 3
57 5 9 3
58 5 8 3
59 5 7 3
60 5 6 3
61 5 5 3
62 5 4 3
63 5 3 3
64 5 2 3
65 5 1 3
66 5 0 3
67 6 10 2
68 6 9 2
69 6 8 2
70 6 7 2
71 6 6 2
72 6 5 2
73 6 4 2
74 6 3 2
75 6 2 2
76 6 1 2
77 6 0 2
78 7 10 3
79 7 9 3
80 7 8 3
81 7 7 3
82 7 6 3
83 7 5 3
84 7 4 3
85 7 3 3
86 7 2 3
87 7 1 3
88 7 0 3
89 8 10 4
90 8 9 4
91 8 8 4
92 8 7 4
93 8 6 4
94 8 5 4
95 8 4 4
96 8 3 4
97 8 2 4
98 8 1 4
99 8 0 4
100 9 10 5
101 9 9 5
102 9 8 5
103 9 7 5
104 9 6 5
105 9 5 5
106 9 4 5
107 9 3 5
108 9 2 5
109 9 1 5
110 9 0 5
111 10 10 4
112 10 9 4
113 10 8 4
114 10 7 4
115 10 6 4
116 10 5 4
117 10 4 4
118 10 3 4
119 10 2 4
120 10 1 4
121 10 0 4
How can we do that ?
Thanks in advance!

This would do it in a more general way
# Build the third column ("No") and the example table.
# N was used below without ever being defined; it is the run length used in
# rep(..., each = N), i.e. the number of rows in each sequence.
N <- 11L # number of rows in each sequence of the first column
num.seq <- 11L # total number of sequences in the first column
num.rows <- N * num.seq # total number of rows
seq.length.3 <- 44 # length of the pattern in the 3rd column
# number of patterns in the 3rd column (partial last pattern counts as one)
num.seq.3 <- (num.rows - 1) %/% seq.length.3 + 1
# starting number in the sequence of the 3rd column
nseq <- 0
# vector for the 3rd column (could be done right in data frame def.)
# First term: nseq and nseq + 1 in runs of N rows, repeated twice per pattern
# (this spans 4 * N rows, which equals seq.length.3 for the values above).
# Second term: an offset of 0, 2, 4, ... that steps up once per pattern.
# The sum is longer than needed, so it is cut back to num.rows elements.
No <- (rep(rep(nseq:(nseq + 1), each = N, times = 2), times = num.seq.3) +
  rep(0:(num.seq.3 - 1) * 2, each = seq.length.3))[1:num.rows]
test_table <- data.frame(col = rep(0:10, each = 11),
                         row = c(rev(0:10)),
                         No = No)
An alternative way:
library(dplyr)
# Alternative: start from a counter that increases every 11 rows, reduce it to
# an alternating 0/1 flag, then add 2 for every completed block of 44 rows.
# Uses num.seq.3 and num.rows as defined in the previous snippet.
dt2 <- test_table %>%
  mutate(No = (row_number() - 1) %/% 11)
# The subscript has to be part of the same expression as rep(): on a line of
# its own, `[1:num.rows]` does not parse and the offset vector would keep its
# full length instead of being trimmed to one value per row.
dt2$No <- dt2$No %% 2 +
  (rep(0:num.seq.3, each = 44, times = num.seq.3) * 2)[1:num.rows]

The arithmetic, which is totally dependent on row numbers, seems right this way.
test_table%>%
mutate(No=((row_number() - 1) %/% 11) %% 2) %>% # alternating 11 rows of 0's and 1's
mutate(No = No + ((row_number() - 1) %/% 44) * 2) # add 2 every after 44 rows
Here is the result, as intended.
structure(list(col = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L), row = c(10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L,
10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L, 8L, 7L,
6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L,
2L, 1L, 0L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L,
9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L, 8L, 7L, 6L,
5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L,
1L, 0L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L,
8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 0L, 10L, 9L, 8L, 7L, 6L, 5L,
4L, 3L, 2L, 1L, 0L), No = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4)), class = "data.frame", .Names = c("col", "row",
"No"), row.names = c(NA, -121L))

This would deliver what I understand to be the requested vector (except I think your sequencing "skipped a beat"):
c( rep( c(1,2,1,2), each=11) , rep(c(3,4,3,4), each=11), rep(c(5,6,5,6), each=11) )
[1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3
[48] 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5
[95] 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6
A more general way:
c( sapply( seq(1, 6, by=2), function(start) {
rep( rep(start:(start+1) , 2), each=11) }))
The outer c() will remove the matrix character that sapply defaults to.

Related

Find the previous-trial score within a nested dataframe in a loop and save as a new variable

I have a dataframe that currently looks like this:
subjectID
Trial
Score
1
1
16
1
1
16
1
1
16
1
2
8
1
2
8
1
2
8
1
3
12
1
3
12
1
3
12
2
1
9
2
1
9
2
1
9
2
2
10
2
2
10
2
2
10
I need to create a new column, Previous_Trial_Score, that is simply the score on the last trial for each person. For example:
subjectID
Trial
Score
Previous_Trial_Score
1
1
16
NA
1
1
16
NA
1
1
16
NA
1
2
8
16
1
2
8
16
1
2
8
16
1
3
12
8
1
3
12
8
1
3
12
8
2
1
9
NA
2
1
9
NA
2
1
9
NA
2
2
10
9
2
2
10
9
2
2
10
9
And so on. Trial 1 for each subject will always be NA, as there is no previous trial for that person. I am writing a for-loop to accomplish this, below:
for (myperson in unique(data$subjectID)){
for (mytrial in unique(data$Trial[data$Trial>1])){
#Specify the trial and person
Prev_Score=as.numeric(unique(data[data$subjectID==myperson & data$Trial==mytrial-1, "Score"]))
#Save it to the dataframe
data[data$subjectID==myperson & data$Trial==mytrial,"Prev_Score"]=Prev_Score
}
}
In the above loop, I had to specify as.numeric and unique to get R to return a single value properly. However, when I run the loop, I get this error:
Error: Assigned data `value` must be compatible with existing data.
i Error occurred for column `Prev_Score`.
x Can't convert from <double> to <logical> due to loss of precision.
* Locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 3...
Does anyone have a solution? I am open to tidyverse/dplyr work-arounds.
We could do a group by summarise to get the lag and then do a join
library(dplyr)
df1 %>%
group_by(subjectID, Trial) %>%
summarise(Previous_Trial_Score = last(Score), .groups= 'drop_last') %>%
mutate(Previous_Trial_Score = lag(Previous_Trial_Score)) %>%
left_join(df1, .)
-output
subjectID Trial Score Previous_Trial_Score
1 1 1 16 NA
2 1 1 16 NA
3 1 1 16 NA
4 1 2 8 16
5 1 2 8 16
6 1 2 8 16
7 1 3 12 8
8 1 3 12 8
9 1 3 12 8
10 2 1 9 NA
11 2 1 9 NA
12 2 1 9 NA
13 2 2 10 9
14 2 2 10 9
15 2 2 10 9
Or slightly compact option with data.table
library(data.table)
setDT(df1)[, Previous_Trial_Score := shift(.SD[, last(Score),
Trial]$V1)[Trial], subjectID]
-output
> df1
subjectID Trial Score Previous_Trial_Score
<int> <int> <int> <int>
1: 1 1 16 NA
2: 1 1 16 NA
3: 1 1 16 NA
4: 1 2 8 16
5: 1 2 8 16
6: 1 2 8 16
7: 1 3 12 8
8: 1 3 12 8
9: 1 3 12 8
10: 2 1 9 NA
11: 2 1 9 NA
12: 2 1 9 NA
13: 2 2 10 9
14: 2 2 10 9
15: 2 2 10 9
data
df1 <- structure(list(subjectID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L), Trial = c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), Score = c(16L, 16L, 16L,
8L, 8L, 8L, 12L, 12L, 12L, 9L, 9L, 9L, 10L, 10L, 10L)),
class = "data.frame", row.names = c(NA,
-15L))
Here is another dplyr option, where we just keep the unique rows with distinct, then get the lag value of Score, and join back to the original dataframe. However, if you have more columns in your dataframe, then @akrun provides a cleaner approach of handling the additional columns.
library(dplyr)
df %>%
distinct() %>%
group_by(subjectID) %>%
mutate(Previous_Trial_Score = lag(Score)) %>%
left_join(df, .)
Output
subjectID Trial Score Previous_Trial_Score
1 1 1 16 NA
2 1 1 16 NA
3 1 1 16 NA
4 1 2 8 16
5 1 2 8 16
6 1 2 8 16
7 1 3 12 8
8 1 3 12 8
9 1 3 12 8
10 2 1 9 NA
11 2 1 9 NA
12 2 1 9 NA
13 2 2 10 9
14 2 2 10 9
15 2 2 10 9
Data
df <- structure(list(subjectID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L), Trial = c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), Score = c(16L, 16L, 16L,
8L, 8L, 8L, 12L, 12L, 12L, 9L, 9L, 9L, 10L, 10L, 10L)), class = "data.frame", row.names = c(NA,
-15L))

Create a new column with some inherent conditions

I would like to create a column called Weight, which would be as follows:
Weight = Mode + (constant * mean).
Mode is column of my database.
Constant is one alternative divided by the total number of alternatives; in this case it is 0.1, because 1/10 = 0.1.
Mean is the average of the values corresponding to a specific alternative. For example, the mean for alternative 10 is (3 + 2 + 2 + 3)/4 = 2.5.
As an example, for alternative 10 the weight would be 2 + (0.1 × 2.5) = 2.25.
database<-structure(list(Alternatives = c(3, 4, 5, 6, 7, 8, 9, 10, 11,
12), Method1 = c(1L, 10L, 7L, 8L, 9L, 6L, 5L, 3L, 4L, 2L), Method2 = c(1L,
8L, 6L, 7L, 10L, 9L, 4L, 2L, 3L, 5L), Method3 = c(1L, 10L, 7L,
8L, 9L, 6L, 4L, 2L, 3L, 5L), Method4 = c(1L, 9L, 6L, 7L, 10L,
8L, 5L, 3L, 4L, 2L), Mode = c(1, 10, 6, 7, 9, 6, 4, 2, 3, 2)), class = "data.frame", row.names = c(NA,-10L))
Alternatives Method1 Method2 Method3 Method4 Mode
1 3 1 1 1 1 1
2 4 10 8 10 9 10
3 5 7 6 7 6 6
4 6 8 7 8 7 7
5 7 9 10 9 10 9
6 8 6 9 6 8 6
7 9 5 4 4 5 4
8 10 3 2 2 3 2
9 11 4 3 3 4 3
10 12 2 5 5 2 2
If your constant equals 0.1 , then try
database |> mutate(Weight = Mode +
.1 * ((Method1 + Method2 + Method3 + Method4) / 4))
Output
Alternatives Method1 Method2 Method3 Method4 Mode Weight
1 3 1 1 1 1 1 1.100
2 4 10 8 10 9 10 10.925
3 5 7 6 7 6 6 6.650
4 6 8 7 8 7 7 7.750
5 7 9 10 9 10 9 9.950
6 8 6 9 6 8 6 6.725
7 9 5 4 4 5 4 4.450
8 10 3 2 2 3 2 2.250
9 11 4 3 3 4 3 3.350
10 12 2 5 5 2 2 2.350
if it is the probability then change it to
database |> mutate(Weight = Mode +
1/ nrow(database) * ((Method1 + Method2 + Method3 + Method4) / 4)) |>
mutate(rank = rank(Weight))
This can also be accomplished using the base package.
database$Weight = .1 * (database$Method1 + database$Method2 + database$Method3 + database$Method4)/4 + database$Mode
database
# Alternatives Method1 Method2 Method3 Method4 Mode Weight
#1 3 1 1 1 1 1 1.100
#2 4 10 8 10 9 10 10.925
#3 5 7 6 7 6 6 6.650
#4 6 8 7 8 7 7 7.750
#5 7 9 10 9 10 9 9.950
#6 8 6 9 6 8 6 6.725
#7 9 5 4 4 5 4 4.450
#8 10 3 2 2 3 2 2.250
#9 11 4 3 3 4 3 3.350
#10 12 2 5 5 2 2 2.350
One can also use the rank function on the resulting "weight" column.
# Rank the alternatives by the "Weight" column computed above. Column names
# are case-sensitive, so `database$weight` would not find that column.
database$rank <- rank(database$Weight)
Additionally, the with function can be used for convenience:
# Same formula evaluated inside `database`, so the columns can be named
# directly; the result is stored under the same name ("Weight") as above.
database$Weight <- with(database, .1 * (Method1 + Method2 + Method3 + Method4) / 4 + Mode)

Insert conditions when I have the same values for the mode or when I don't have the mode value

The code below generates the mode value from the values obtained by Methods 1, 2, 3 and 4. In some cases the mode values are correct, for example for alternatives 3 and 4, but in others they are incorrect: alternative 5 has two values of 7 and two values of 6, yet the mode is shown as 6. Furthermore, alternatives 11 and 12 have no mode value, because all the methods give different values. So for these cases, that is, when two values are tied for the same alternative or when there is no mode value, I would like the value obtained by Method 1 to be used as the mode value. I have inserted the correct output below.
Executable code below:
database<-structure(list(Alternatives = c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
Method1 = c(1L, 10L, 7L, 8L, 9L, 6L, 5L, 3L, 4L, 2L), Method2 = c(1L,
8L, 6L, 7L, 10L, 9L, 4L, 2L, 5L, 3L), Method3 = c(1L,
10L, 7L, 8L, 9L, 6L, 4L, 2L, 3L, 5L), Method4 = c(1L,
9L, 6L, 7L, 10L, 8L, 5L, 3L, 2L, 4L)), class = "data.frame", row.names = c(NA,
10L))
# Most frequent value of Vec, returned as a number.
# Returns NA when no value occurs more than once.
ModeFunc <- function(Vec) {
  # Frequency table ordered from the most to the least common value.
  counts <- sort(table(Vec), decreasing = TRUE)
  # Every value appears at most once: there is no mode to report.
  if (max(counts) <= 1) {
    return(NA)
  }
  # Only the first entry of the sorted table is used, so a tie between two
  # equally frequent values is not detected here.
  as.numeric(names(counts)[1])
}
output <- database |> rowwise() |>
mutate(Mode = ModeFunc(c_across(Method1:Method4))) %>%
data.frame()
> output
Alternatives Method1 Method2 Method3 Method4 Mode
1 3 1 1 1 1 1
2 4 10 8 10 9 10
3 5 7 6 7 6 6
4 6 8 7 8 7 7
5 7 9 10 9 10 9
6 8 6 9 6 8 6
7 9 5 4 4 5 4
8 10 3 2 2 3 2
9 11 4 5 3 2 NA
10 12 2 3 5 4 NA
The correct output would then be:
Alternatives Method1 Method2 Method3 Method4 Mode
3 1 1 1 1 1
4 10 8 10 9 10
5 7 6 7 6 7
6 8 7 8 7 8
7 9 10 9 10 9
8 6 9 6 8 6
9 5 4 4 5 5
10 3 2 2 3 3
11 4 5 3 2 4
12 2 3 5 4 2
You could use some conventional mode() function,
# Return every value of x that reaches the highest frequency, i.e. all tied
# modes, in order of first appearance.
# Note: defining this masks base::mode() while it is in scope.
mode <- function(x) {
  candidates <- unique(x)
  # How often each distinct value occurs, aligned with `candidates`.
  freq <- tabulate(match(x, candidates))
  is_top <- freq == max(freq)
  candidates[is_top]
}
and update values using ifelse in mapply.
mds <- apply(database[-1], 1, mode) |> setNames(database$Alternatives)
mapply(\(x, y) ifelse(length(x) > 1, y, x), mds, database$Method1)
# 3 4 5 6 7 8 9 10 11 12
# 1 10 7 8 9 6 5 3 4 2
So, altogether it could look like this:
database |>
cbind(Mode=mapply(\(x, y) ifelse(length(x) > 1, y, x),
apply(database[-1], 1, mode),
database$Method1))
# Alternatives Method1 Method2 Method3 Method4 Mode
# 1 3 1 1 1 1 1
# 2 4 10 8 10 9 10
# 3 5 7 6 7 6 7
# 4 6 8 7 8 7 8
# 5 7 9 10 9 10 9
# 6 8 6 9 6 8 6
# 7 9 5 4 4 5 5
# 8 10 3 2 2 3 3
# 9 11 4 5 3 2 4
# 10 12 2 3 5 4 2

Define new variable to take on 1 if next row of another variable fulfills condition

I'm trying to set up my dataset for event-history analysis, and for this I need to define a new column. My dataset is of the following form:
ID Var1
1 10
1 20
1 30
1 10
2 4
2 5
2 10
2 5
3 1
3 15
3 20
3 9
4 18
4 32
4 NA
4 12
5 2
5 NA
5 8
5 3
And I want to get to the following form:
ID Var1 Var2
1 10 0
1 20 0
1 30 1
1 10 0
2 4 0
2 5 0
2 10 0
2 5 0
3 1 0
3 15 0
3 20 1
3 9 0
4 18 0
4 32 NA
4 NA 1
4 12 0
5 2 NA
5 NA 0
5 8 1
5 3 0
So in words: I want the new variable to indicate, if the value of Var1 (with respect to the group) drops below 50% of the maximum value Var1 reaches for that group. Whether the last value is NA or 0 is not really of importance, although NA would make more sense from a theoretical perspective.
I've tried using something like
DF$Var2 <- df %>%
group_by(ID) %>%
ifelse(df == ave(df$Var1,df$ID, FUN = max), 0,1)
to then lag it by 1, but it returns an error on an unused argument 1 in ifelse.
Thanks for your solutions!
Here is a base R option via ave + cummax
within(df,Var2 <- ave(Var1,ID,FUN = function(x) c((x<max(x)/2 & cummax(x)==max(x))[-1],0)))
which gives
> within(df,Var2 <- ave(Var1,ID,FUN = function(x) c((x<max(x)/2 & cummax(x)==max(x))[-1],0)))
ID Var1 Var2
1 1 10 0
2 1 20 0
3 1 30 1
4 1 10 0
5 2 4 0
6 2 5 0
7 2 10 0
8 2 5 0
9 3 1 0
10 3 15 0
11 3 20 1
12 3 9 0
Data
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), Var1 = c(10L, 20L, 30L, 10L, 4L, 5L, 10L, 5L, 1L, 15L,
20L, 9L)), class = "data.frame", row.names = c(NA, -12L))
Edit (for updated post)
# Per-group indicator: 1 on the row just before the value falls below half of
# the group maximum (once the maximum has been reached), 0 elsewhere, and NA
# on the row just before a missing value.
f <- function(v) {
  is_gap <- is.na(v)
  # 0 for observed values and NA for missing ones, shifted one row earlier
  # (a trailing 0 is appended for the last row).
  gap_flag <- c(replace(v, !is_gap, 0), 0)[-1]
  # Fill each missing value with the value on the row directly before it.
  v[is_gap] <- v[which(is_gap) - 1]
  peak <- max(v)
  # TRUE where the value is under half the peak and the peak has been seen.
  below_half <- v < peak / 2 & cummax(v) == peak
  # Shift one row earlier so the flag marks the row before the drop.
  drop_flag <- c(below_half[-1], 0)
  gap_flag + drop_flag
}
within(df,Var2 <- ave(Var1,ID,FUN = f))
such that
> within(df,Var2 <- ave(Var1,ID,FUN = f))
ID Var1 Var2
1 1 10 0
2 1 20 0
3 1 30 1
4 1 10 0
5 2 4 0
6 2 5 0
7 2 10 0
8 2 5 0
9 3 1 0
10 3 15 0
11 3 20 1
12 3 9 0
13 4 18 0
14 4 32 NA
15 4 NA 1
16 4 12 0
17 5 2 NA
18 5 NA 0
19 5 8 1
20 5 3 0
Data
# Example data for the updated question: five IDs with four rows each, where
# Var1 contains two missing values (IDs 4 and 5).
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), Var1 = c(10L, 20L, 30L,
10L, 4L, 5L, 10L, 5L, 1L, 15L, 20L, 9L, 18L, 32L, NA, 12L, 2L,
NA, 8L, 3L)), class = "data.frame", row.names = c(NA, -20L))

Dataframe, split by column values and put into new columns

I am new to R, and currently working with setting up my data.
My data comes in a format where each row contains a single measurement (DV), and a column with an explanation for the type of measurement (DVID).
Here is an example of my data:
ID TIME DV DVID
1 0 0.0 7
1 1 27.5 1
1 1 0.0 7
1 4 19.6 1
1 4 0.0 7
1 8 17.9 1
1 8 0.0 7
1 12 17.7 1
1 12 0.0 7
1 24 19.6 1
1 24 0.0 7
1 48 32.9 1
1 48 0.0 7
2 0 0.0 7
2 1 0.0 7
2 4 0.0 7
2 8 0.0 7
2 12 0.0 7
2 24 0.0 7
2 48 27.3 1
2 72 30.9 1
2 72 0.0 7
2 96 20.8 1
3 0 1.0 7
3 1 7.0 1
3 1 0.0 7
3 4 15.0 1
3 4 0.0 7
3 8 27.2 1
3 8 0.0 7
3 12 0.0 7
3 24 47.0 1
3 24 0.0 7
3 48 65.4 1
3 48 0.0 7
3 72 68.7 1
3 72 0.0 7
3 96 82.8 1
3 96 0.0 7
3 120 70.5 1
What I want to do is to "pair together" the different types of measurements, so I have one column with the measurements that is one type (DVID=1) and another column with the measurements that is another type (DVID=7).
I also need to delete the measurements where I don't have both type of measurements (or, alternatively, put in NA in these fields)
An example of this looks like:
ID TIME DV_1 DV_7
1 1 27.5 0
1 4 19.6 0
1 8 17.9 0
1 12 17.7 0
1 24 19.6 0
1 48 32.9 0
The purpose is that I want to be able to plot the DVID = 1 values against the DVID = 7 values.
Can anyone here help me with doing this?
I know that I probably have to use functions in the split and apply family, but I have no idea where to start.
Thanks in advance!
Here is one approach.
library(dplyr)
library(tidyr)
#Create one column for group1 and another for group7 in DVID
ana <- spread(foo, DVID, DV)
colnames(ana) <- c("ID", "TIME", "DV1", "DV7")
# Remove rows which have NA
filter(ana, !DV1 %in% NA & !DV7 %in% NA)
# ID TIME DV1 DV7
#1 1 1 27.5 0
#2 1 4 19.6 0
#3 1 8 17.9 0
#4 1 12 17.7 0
#5 1 24 19.6 0
#6 1 48 32.9 0
#7 2 72 30.9 0
#8 3 1 7.0 0
#9 3 4 15.0 0
#10 3 8 27.2 0
#11 3 24 47.0 0
#12 3 48 65.4 0
#13 3 72 68.7 0
#14 3 96 82.8 0
Another way could be this given you convert your data frame to data.table
setDT(foo)
bob <- dcast.data.table(foo, ID + TIME ~ DVID, value.var = "DV")
setnames(bob, c("1","7"), c("DV1", "DV7"))[!DV1 %in% NA & !DV7 %in% NA, ]
Update
Given @Arun's advice, the 3rd line can be like this using data.table 1.9.5
na.omit(bob, by=c("1", "7"))
You appear to be wanting to reshape your data. Use cast from the reshape package.
library(reshape)
# read data
dfX = read.table(textConnection("ID TIME DV DVID
1 0 0.0 7
1 1 27.5 1
1 1 0.0 7
1 4 19.6 1
1 4 0.0 7
1 8 17.9 1
1 8 0.0 7
1 12 17.7 1
1 12 0.0 7
1 24 19.6 1
1 24 0.0 7
1 48 32.9 1
1 48 0.0 7
2 0 0.0 7
2 1 0.0 7
2 4 0.0 7
2 8 0.0 7
2 12 0.0 7
2 24 0.0 7
2 48 27.3 1
2 72 30.9 1
2 72 0.0 7
2 96 20.8 1
3 0 1.0 7
3 1 7.0 1
3 1 0.0 7
3 4 15.0 1
3 4 0.0 7
3 8 27.2 1
3 8 0.0 7
3 12 0.0 7
3 24 47.0 1
3 24 0.0 7
3 48 65.4 1
3 48 0.0 7
3 72 68.7 1
3 72 0.0 7
3 96 82.8 1
3 96 0.0 7
3 120 70.5 1"), header = TRUE)
# reshape the data
reshape::cast(dfX, ID + TIME ~ DVID, value = "DV")
Here is the output:
> reshape::cast(dfX, ID + TIME ~ DVID, value = "DV")
ID TIME 1 7
1 1 0 NA 0
2 1 1 27.5 0
3 1 4 19.6 0
4 1 8 17.9 0
5 1 12 17.7 0
6 1 24 19.6 0
7 1 48 32.9 0
8 2 0 NA 0
9 2 1 NA 0
10 2 4 NA 0
11 2 8 NA 0
12 2 12 NA 0
13 2 24 NA 0
14 2 48 27.3 NA
15 2 72 30.9 0
16 2 96 20.8 NA
17 3 0 NA 1
18 3 1 7.0 0
19 3 4 15.0 0
20 3 8 27.2 0
21 3 12 NA 0
22 3 24 47.0 0
23 3 48 65.4 0
24 3 72 68.7 0
25 3 96 82.8 0
26 3 120 70.5 NA
In addition, you could use reshape from base R
na.omit(reshape(df, idvar = c("ID","TIME"),
timevar="DVID", direction = "wide"))[,c(1:2,4:3)]
# ID TIME DV.1 DV.7
#2 1 1 27.5 0
#4 1 4 19.6 0
#6 1 8 17.9 0
#8 1 12 17.7 0
#10 1 24 19.6 0
#12 1 48 32.9 0
#21 2 72 30.9 0
#25 3 1 7.0 0
#27 3 4 15.0 0
#29 3 8 27.2 0
#32 3 24 47.0 0
#34 3 48 65.4 0
#36 3 72 68.7 0
#38 3 96 82.8 0
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), TIME = c(0L,
1L, 1L, 4L, 4L, 8L, 8L, 12L, 12L, 24L, 24L, 48L, 48L, 0L, 1L,
4L, 8L, 12L, 24L, 48L, 72L, 72L, 96L, 0L, 1L, 1L, 4L, 4L, 8L,
8L, 12L, 24L, 24L, 48L, 48L, 72L, 72L, 96L, 96L, 120L), DV = c(0,
27.5, 0, 19.6, 0, 17.9, 0, 17.7, 0, 19.6, 0, 32.9, 0, 0, 0, 0,
0, 0, 0, 27.3, 30.9, 0, 20.8, 1, 7, 0, 15, 0, 27.2, 0, 0, 47,
0, 65.4, 0, 68.7, 0, 82.8, 0, 70.5), DVID = c(7L, 1L, 7L, 1L,
7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 1L,
1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 7L, 1L, 7L, 1L, 7L, 1L,
7L, 1L, 7L, 1L)), .Names = c("ID", "TIME", "DV", "DVID"), class = "data.frame", row.names = c(NA,
-40L))

Resources