Here is my data set. I would like to add 5 new columns to mydata, each based on a different condition.
mydata <- data.frame(sub = rep(c(1:4), c(3, 4, 5, 5)),
                     t = c(1:3, 1:4, 1:5, 1:5),
                     y.val = c(10, 20, 13,
                               5, 7, 8, 0,
                               45, 17, 25, 12, 10,
                               40, 0, 0, 5, 8))
mydata
sub t y.val
1 1 1 10
2 1 2 20
3 1 3 13
4 2 1 5
5 2 2 7
6 2 3 8
7 2 4 0
8 3 1 45
9 3 2 17
10 3 3 25
11 3 4 12
12 3 5 10
13 4 1 40
14 4 2 0
15 4 3 0
16 4 4 5
17 4 5 8
I would like to add the following 5 columns (5 being the maximum of the 't' column):
mydata$It1=ifelse(mydata$t==1 & mydata$y.val>0,1,0)
mydata$It2=ifelse(mydata$t==2 & mydata$y.val>0,1,0)
mydata$It3=ifelse(mydata$t==3 & mydata$y.val>0,1,0)
mydata$It4=ifelse(mydata$t==4 & mydata$y.val>0,1,0)
mydata$It5=ifelse(mydata$t==5 & mydata$y.val>0,1,0)
Here is the expected outcome.
> mydata
sub t y.val It1 It2 It3 It4 It5
1 1 1 10 1 0 0 0 0
2 1 2 20 0 1 0 0 0
3 1 3 13 0 0 1 0 0
4 2 1 5 1 0 0 0 0
5 2 2 7 0 1 0 0 0
6 2 3 8 0 0 1 0 0
7 2 4 0 0 0 0 0 0
8 3 1 45 1 0 0 0 0
9 3 2 17 0 1 0 0 0
10 3 3 25 0 0 1 0 0
11 3 4 12 0 0 0 1 0
12 3 5 10 0 0 0 0 1
13 4 1 40 1 0 0 0 0
14 4 2 0 0 0 0 0 0
15 4 3 0 0 0 0 0 0
16 4 4 5 0 0 0 1 0
17 4 5 8 0 0 0 0 1
I would appreciate help writing this as a function, using a for loop or any other technique.
You could use sapply/lapply:
n <- seq_len(5)
mydata[paste0("It", n)] <- +(sapply(n, function(x) mydata$t == x & mydata$y.val > 0))
mydata
# sub t y.val It1 It2 It3 It4 It5
#1 1 1 10 1 0 0 0 0
#2 1 2 20 0 1 0 0 0
#3 1 3 13 0 0 1 0 0
#4 2 1 5 1 0 0 0 0
#5 2 2 7 0 1 0 0 0
#6 2 3 8 0 0 1 0 0
#7 2 4 0 0 0 0 0 0
#8 3 1 45 1 0 0 0 0
#9 3 2 17 0 1 0 0 0
#10 3 3 25 0 0 1 0 0
#11 3 4 12 0 0 0 1 0
#12 3 5 10 0 0 0 0 1
#13 4 1 40 1 0 0 0 0
#14 4 2 0 0 0 0 0 0
#15 4 3 0 0 0 0 0 0
#16 4 4 5 0 0 0 1 0
#17 4 5 8 0 0 0 0 1
mydata$t==x & mydata$y.val>0 returns a logical vector of TRUE/FALSE values based on the condition. The unary + converts those logical values to 1/0 respectively, which avoids using ifelse, i.e. ifelse(condition, 1, 0).
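The coercion is easy to check in the console:
+c(FALSE, TRUE)
# [1] 0 1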
Here's another approach based on multiplying a model matrix by the logical y.val > 0.
df <- cbind(mydata[1:3], model.matrix(~ factor(t) + 0, mydata)*(mydata$y.val>0))
Which gives:
sub t y.val factor.t.1 factor.t.2 factor.t.3 factor.t.4 factor.t.5
1 1 1 10 1 0 0 0 0
2 1 2 20 0 1 0 0 0
3 1 3 13 0 0 1 0 0
4 2 1 5 1 0 0 0 0
5 2 2 7 0 1 0 0 0
6 2 3 8 0 0 1 0 0
7 2 4 0 0 0 0 0 0
8 3 1 45 1 0 0 0 0
9 3 2 17 0 1 0 0 0
10 3 3 25 0 0 1 0 0
11 3 4 12 0 0 0 1 0
12 3 5 10 0 0 0 0 1
13 4 1 40 1 0 0 0 0
14 4 2 0 0 0 0 0 0
15 4 3 0 0 0 0 0 0
16 4 4 5 0 0 0 1 0
17 4 5 8 0 0 0 0 1
To clean up the names you can do:
names(df) <- sub("factor.t.", "It", names(df), fixed = TRUE)
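After the rename, names(df) is sub, t, y.val, It1 through It5, matching the expected output.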
You can use sapply to compare t for equality against each of 1:5 and combine the result with y.val>0 using &.
within(mydata, It <- +(sapply(1:5, `==`, t) & y.val>0))
# sub t y.val It.1 It.2 It.3 It.4 It.5
#1 1 1 10 1 0 0 0 0
#2 1 2 20 0 1 0 0 0
#3 1 3 13 0 0 1 0 0
#4 2 1 5 1 0 0 0 0
#5 2 2 7 0 1 0 0 0
#6 2 3 8 0 0 1 0 0
#7 2 4 0 0 0 0 0 0
#8 3 1 45 1 0 0 0 0
#9 3 2 17 0 1 0 0 0
#10 3 3 25 0 0 1 0 0
#11 3 4 12 0 0 0 1 0
#12 3 5 10 0 0 0 0 1
#13 4 1 40 1 0 0 0 0
#14 4 2 0 0 0 0 0 0
#15 4 3 0 0 0 0 0 0
#16 4 4 5 0 0 0 1 0
#17 4 5 8 0 0 0 0 1
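Since the question asked for a version wrapped in a function with a for loop, here is a minimal sketch along those lines (add_indicators is just an illustrative name, not from any package):
add_indicators <- function(df, tcol = "t", ycol = "y.val") {
  # one indicator column per value of t, set to 1 where y.val > 0 in that week
  for (k in seq_len(max(df[[tcol]]))) {
    df[[paste0("It", k)]] <- as.integer(df[[tcol]] == k & df[[ycol]] > 0)
  }
  df
}
mydata <- add_indicators(mydata)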
Here's a tidyverse solution, using pivot_wider:
library(tidyverse)
mydata %>%
  mutate(new_col = paste0("It", t),
         y_test = as.integer(y.val > 0)) %>%
  pivot_wider(id_cols = c(sub, t, y.val),
              names_from = new_col,
              values_from = y_test,
              values_fill = list(y_test = 0))
sub t y.val It1 It2 It3 It4 It5
<int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 10 1 0 0 0 0
2 1 2 20 0 1 0 0 0
3 1 3 13 0 0 1 0 0
4 2 1 5 1 0 0 0 0
5 2 2 7 0 1 0 0 0
6 2 3 8 0 0 1 0 0
7 2 4 0 0 0 0 0 0
8 3 1 45 1 0 0 0 0
9 3 2 17 0 1 0 0 0
10 3 3 25 0 0 1 0 0
11 3 4 12 0 0 0 1 0
12 3 5 10 0 0 0 0 1
13 4 1 40 1 0 0 0 0
14 4 2 0 0 0 0 0 0
15 4 3 0 0 0 0 0 0
16 4 4 5 0 0 0 1 0
17 4 5 8 0 0 0 0 1
Explanation:
Make two columns: new_col (the new column names, prefixed with "It") and y_test (the 0/1 indicator for y.val > 0).
Pivot new_col values into column names.
Fill in the NA values with zeros.
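Note: in later tidyr releases (1.1.0 and up), values_fill also accepts a bare value, so values_fill = 0 should work in place of the named-list form.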
One purrr and dplyr option could be:
map_dfc(.x = 1:5,
        ~ mydata %>%
            mutate(!!paste0("It", .x) := as.integer(t == .x & y.val > 0)) %>%
            select(starts_with("It"))) %>%
  bind_cols(mydata)
It1 It2 It3 It4 It5 sub t y.val
1 1 0 0 0 0 1 1 10
2 0 1 0 0 0 1 2 20
3 0 0 1 0 0 1 3 13
4 1 0 0 0 0 2 1 5
5 0 1 0 0 0 2 2 7
6 0 0 1 0 0 2 3 8
7 0 0 0 0 0 2 4 0
8 1 0 0 0 0 3 1 45
9 0 1 0 0 0 3 2 17
10 0 0 1 0 0 3 3 25
11 0 0 0 1 0 3 4 12
12 0 0 0 0 1 3 5 10
13 1 0 0 0 0 4 1 40
14 0 0 0 0 0 4 2 0
15 0 0 0 0 0 4 3 0
16 0 0 0 1 0 4 4 5
17 0 0 0 0 1 4 5 8
Or, if you want to generate the columns dynamically according to the range of the t column:
map_dfc(.x = reduce(as.list(range(mydata$t)), `:`),
        ~ mydata %>%
            mutate(!!paste0("It", .x) := as.integer(t == .x & y.val > 0)) %>%
            select(starts_with("It"))) %>%
  bind_cols(mydata)
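The reduce(as.list(range(mydata$t)), `:`) call simply builds the integer sequence from the smallest to the largest value of t; a plainer equivalent would be:
seq(min(mydata$t), max(mydata$t))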
I have dummy coded the data in R using the package named "dummies". This expanded each categorical attribute into separate dummy columns, one per level. I want to consolidate those level columns back into one column per attribute. Please help me out! Thanks in advance. The following is the code I used:
#To read the data from the working directory
bankfull<-read.csv("bank.csv")
#Calling the library named dummies
library(dummies)
#Redefining the bankfull data with the dummy codes
bankfull<-dummy.data.frame(bankfull,sep=",")
#Viewing the data after dummy coding
print(bankfull)
The output is as follows:
#To read the data from the working directory
> bankfull<-read.csv("bank.csv")
> #Calling the library named dummies
> library(dummies)
dummies-1.5.6 provided by Decision Patterns
> #Redefining the bankfull data with the dummy codes
> bankfull<-dummy.data.frame(bankfull,sep=",")
> #Viewing the data after dummy coding
> View(bankfull)
> library(carData)
> #Viewing the data after dummy coding
> print(bankfull)
CHK_ACCT DURATION HISTORY NEW_CAR USED_CAR FURNITURE RADIO_TV EDUCATION RETRAINING AMOUNT SAV_ACCT EMPLOYMENT INSTALL_RATE MALE_DIV
1 0 6 4 0 0 0 1 0 0 1169 4 4 4 0
2 1 48 2 0 0 0 1 0 0 5951 0 2 2 0
3 3 12 4 0 0 0 0 1 0 2096 0 3 2 0
4 0 42 2 0 0 1 0 0 0 7882 0 3 2 0
5 0 24 3 1 0 0 0 0 0 4870 0 2 3 0
6 3 36 2 0 0 0 0 1 0 9055 4 2 2 0
7 3 24 2 0 0 1 0 0 0 2835 2 4 3 0
8 1 36 2 0 1 0 0 0 0 6948 0 2 2 0
9 3 12 2 0 0 0 1 0 0 3059 3 3 2 1
10 1 30 4 1 0 0 0 0 0 5234 0 0 4 0
11 1 12 2 1 0 0 0 0 0 1295 0 1 3 0
12 0 48 2 0 0 0 0 0 1 4308 0 1 3 0
13 1 12 2 0 0 0 1 0 0 1567 0 2 1 0
14 0 24 4 1 0 0 0 0 0 1199 0 4 4 0
15 0 15 2 1 0 0 0 0 0 1403 0 2 2 0
16 0 24 2 0 0 0 1 0 0 1282 1 2 4 0
17 3 24 4 0 0 0 1 0 0 2424 4 4 4 0
18 0 30 0 0 0 0 0 0 1 8072 4 1 2 0
19 1 24 2 0 1 0 0 0 0 12579 0 4 4 0
20 3 24 2 0 0 0 1 0 0 3430 2 4 3 0
21 3 9 4 1 0 0 0 0 0 2134 0 2 4 0
22 0 6 2 0 0 0 1 0 0 2647 2 2 2 0
23 0 10 4 1 0 0 0 0 0 2241 0 1 1 0
24 1 12 4 0 1 0 0 0 0 1804 1 1 3 0
25 3 10 4 0 0 1 0 0 0 2069 4 2 2 0
26 0 6 2 0 0 1 0 0 0 1374 0 2 1 0
27 3 6 0 0 0 0 1 0 0 426 0 4 4 0
28 2 12 1 0 0 0 1 0 0 409 3 2 3 0
29 1 7 2 0 0 0 1 0 0 2415 0 2 3 0
30 0 60 3 0 0 0 0 0 1 6836 0 4 3 0
31 1 18 2 0 0 0 0 0 1 1913 3 1 3 0
32 0 24 2 0 0 1 0 0 0 4020 0 2 2 0
MALE_SINGLE MALE_MAR_or_WID CO_APPLICANT GUARANTOR PRESENT_RESIDENT REAL_ESTATE PROP_UNKN_NONE AGE OTHER_INSTALL RENT OWN_RES NUM_CREDITS
1 1 0 0 0 4 1 0 67 0 0 1 2
2 0 0 0 0 2 1 0 22 0 0 1 1
3 1 0 0 0 3 1 0 49 0 0 1 1
4 1 0 0 1 4 0 0 45 0 0 0 1
5 1 0 0 0 4 0 1 53 0 0 0 2
6 1 0 0 0 4 0 1 35 0 0 0 1
7 1 0 0 0 4 0 0 53 0 0 1 1
8 1 0 0 0 2 0 0 35 0 1 0 1
9 0 0 0 0 4 1 0 61 0 0 1 1
10 0 1 0 0 2 0 0 28 0 0 1 2
11 0 0 0 0 1 0 0 25 0 1 0 1
12 0 0 0 0 4 0 0 24 0 1 0 1
13 0 0 0 0 1 0 0 22 0 0 1 1
14 1 0 0 0 4 0 0 60 0 0 1 2
15 0 0 0 0 4 0 0 28 0 1 0 1
16 0 0 0 0 2 0 0 32 0 0 1 1
17 1 0 0 0 4 0 0 53 0 0 1 2
18 1 0 0 0 3 0 0 25 1 0 1 3
19 0 0 0 0 2 0 1 44 0 0 0 1
20 1 0 0 0 2 0 0 31 0 0 1 1
21 1 0 0 0 4 0 0 48 0 0 1 3
22 1 0 0 0 3 1 0 44 0 1 0 1
23 1 0 0 0 3 1 0 48 0 1 0 2
24 1 0 0 0 4 0 0 44 0 0 1 1
25 0 1 0 0 1 0 0 26 0 0 1 2
26 1 0 0 0 2 1 0 36 1 0 1 1
27 0 1 0 0 4 0 0 39 0 0 1 1
28 0 0 0 0 3 1 0 42 0 1 0 2
29 1 0 0 1 2 1 0 34 0 0 1 1
30 1 0 0 0 4 0 1 63 0 0 1 2
31 0 1 0 0 3 1 0 36 1 0 1 1
32 1 0 0 0 2 0 0 27 1 0 1 1
JOB NUM_DEPENDENTS TELEPHONE FOREIGN RESPONSE
1 2 1 1 0 1
2 2 1 0 0 0
3 1 2 0 0 1
4 2 2 0 0 1
5 2 2 0 0 0
6 1 2 1 0 1
7 2 1 0 0 1
8 3 1 1 0 1
9 1 1 0 0 1
10 3 1 0 0 0
11 2 1 0 0 0
12 2 1 0 0 0
13 2 1 1 0 1
14 1 1 0 0 0
15 2 1 0 0 1
16 1 1 0 0 0
17 2 1 0 0 1
18 2 1 0 0 1
19 3 1 1 0 0
20 2 2 1 0 1
21 2 1 1 0 1
22 2 2 0 0 1
23 1 2 0 1 1
24 2 1 0 0 1
25 2 1 0 1 1
26 1 1 1 0 1
27 1 1 0 0 1
28 2 1 0 0 1
29 2 1 0 0 1
30 2 1 1 0 0
31 2 1 1 0 1
32 2 1 0 0 1
[ reached getOption("max.print") -- omitted 968 rows ]
In the output the dummies are given per level, but I need the output per attribute. Can I merge them without using any third-party packages?
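If the aim is to collapse a block of one-hot level columns back into a single attribute column, base R can do it without extra packages. A minimal sketch, assuming (hypothetically) that NEW_CAR through RETRAINING are the one-hot levels of a single purpose attribute:
# hypothetical: treat these one-hot columns as levels of one PURPOSE attribute
purpose_cols <- c("NEW_CAR", "USED_CAR", "FURNITURE", "RADIO_TV",
                  "EDUCATION", "RETRAINING")
ind <- as.matrix(bankfull[purpose_cols])
# max.col picks the column holding the 1 in each row; all-zero rows become NA
bankfull$PURPOSE <- ifelse(rowSums(ind) == 0, NA,
                           purpose_cols[max.col(ind, ties.method = "first")])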
I have a text file that I want to use for survival data analysis:
1 0 0 0 15 0 0 1 1 0 0 2 12 0 12 0 12 0
2 0 0 1 20 0 0 1 0 0 0 4 9 0 9 0 9 0
3 0 0 1 15 0 0 0 1 1 0 2 13 0 13 0 7 1
4 0 0 0 20 1 0 1 0 0 0 2 11 1 29 0 29 0
5 0 0 1 70 1 1 1 1 0 0 2 28 1 31 0 4 1
6 0 0 1 20 1 0 1 0 0 0 4 11 0 11 0 8 1
7 0 0 1 5 0 0 0 0 0 1 4 12 0 12 0 11 1
8 0 0 1 30 1 0 1 1 0 0 4 8 1 34 0 4 1
9 0 0 1 25 0 1 0 1 1 0 4 10 1 53 0 4 1
10 0 0 1 20 0 1 0 1 0 0 4 7 0 1 1 7 0
11 0 0 1 30 1 0 1 0 0 1 4 7 1 21 1 44 1
12 0 0 0 20 0 0 1 0 0 1 4 20 0 1 1 20 0
13 0 0 1 25 0 0 1 1 1 0 4 12 1 32 0 32 0
14 0 0 1 70 0 0 0 0 0 1 4 16 0 16 0 16 0
15 0 0 1 20 1 0 1 0 0 0 4 39 0 39 0 39 0
16 0 0 0 10 1 0 1 0 0 1 4 23 1 34 0 34 0
17 0 0 1 10 1 0 0 0 0 0 4 8 0 8 0 8 0
18 0 0 1 15 0 0 0 0 0 0 4 15 0 15 0 6 1
19 0 0 1 10 0 0 0 0 0 1 4 8 0 8 0 8 0
20 0 0 1 15 0 0 0 0 1 0 4 24 1 32 0 32 0
21 0 0 1 16 0 0 1 0 0 0 4 25 1 22 1 43 0
22 0 1 1 55 1 0 1 1 0 0 4 14 1 3 1 56 0
23 0 0 1 20 1 0 1 1 0 0 4 24 1 47 0 11 1
24 0 0 0 30 0 0 0 1 1 0 4 6 1 43 0 43 0
25 0 0 1 40 0 1 0 1 1 0 1 25 0 3 1 25 0
26 0 0 1 15 1 0 1 1 0 0 4 12 0 12 0 12 0
27 0 1 1 50 0 0 1 0 0 1 4 15 1 53 0 32 1
28 0 0 1 40 1 0 1 1 0 0 4 18 1 52 0 51 1
29 0 1 1 45 0 1 1 1 1 0 4 13 1 11 1 21 0
30 0 1 0 40 0 1 1 1 1 0 2 29 0 2 1 29 0
31 0 0 1 28 0 0 1 0 0 0 2 7 0 7 0 3 1
32 0 0 1 19 1 0 1 0 0 0 3 16 0 16 0 16 0
33 0 0 1 15 0 0 1 0 0 0 2 10 0 10 0 3 1
34 0 0 1 5 0 0 1 0 1 0 3 6 0 6 0 4 1
35 0 1 1 35 0 0 1 0 0 0 4 8 1 43 0 7 1
36 0 0 1 2 1 0 1 0 0 0 1 1 1 27 0 27 0
37 0 1 1 5 0 0 1 0 0 0 2 18 0 18 0 18 0
38 0 0 1 55 1 0 1 0 0 1 4 6 1 5 1 47 1
39 0 0 0 10 0 0 0 1 0 0 2 19 1 29 0 29 0
40 0 0 1 15 0 0 1 0 0 0 4 5 0 5 0 5 0
41 0 1 1 20 1 0 1 0 0 1 4 1 1 4 1 97 0
42 0 1 0 30 1 0 1 1 0 1 4 15 1 28 0 28 0
43 0 0 1 25 1 1 1 1 0 1 4 14 1 4 1 7 1
44 0 0 1 95 1 1 1 1 1 1 4 9 0 9 0 3 1
45 0 1 1 30 0 0 0 0 1 0 4 1 1 39 0 39 0
46 0 0 1 15 1 0 1 0 0 0 4 10 0 10 0 10 0
47 0 0 1 20 0 1 1 1 0 0 4 6 1 5 1 46 0
48 0 1 1 6 0 0 1 0 0 0 2 13 1 28 0 28 0
49 0 0 1 15 0 0 1 0 0 1 4 11 1 21 0 21 0
50 0 0 1 7 0 0 1 1 0 0 1 8 1 17 1 38 0
51 0 0 1 13 0 0 1 1 1 0 4 10 0 10 0 10 0
52 0 0 1 25 1 0 1 0 0 1 4 6 1 40 0 5 1
53 0 0 1 25 1 0 1 0 1 1 4 18 1 22 0 9 1
54 0 1 1 20 1 0 1 0 0 1 4 16 1 16 1 21 1
55 0 1 1 25 0 0 1 1 0 0 4 7 1 26 0 26 0
56 0 0 1 95 1 0 1 1 1 1 4 14 0 14 0 14 0
57 0 0 1 17 1 0 1 0 0 0 4 16 0 16 0 16 0
58 0 0 1 3 0 0 1 0 1 0 3 4 0 4 0 1 1
59 0 0 1 15 1 0 1 0 0 0 4 19 0 6 1 19 0
60 0 0 1 65 1 1 1 1 1 1 4 21 1 8 1 10 1
61 0 1 1 15 1 0 1 1 1 1 4 18 0 18 0 18 0
62 0 0 1 40 1 0 1 0 0 0 3 31 0 31 0 13 1
63 0 0 1 45 1 0 1 1 0 1 4 11 1 24 1 40 0
64 0 1 0 35 0 0 1 1 0 0 4 4 1 5 1 47 0
65 0 0 1 85 1 1 1 1 0 1 4 12 1 8 1 9 1
66 0 1 1 15 0 1 0 1 0 1 4 11 1 35 0 19 1
67 0 0 1 70 0 1 1 1 1 0 2 23 1 8 1 60 0
68 0 0 1 6 1 0 0 0 0 1 4 7 0 7 0 7 0
69 0 0 1 20 0 0 1 0 0 0 4 19 1 26 0 6 1
70 0 1 1 36 1 0 1 0 1 1 4 16 1 20 1 23 1
71 1 1 1 50 1 1 1 0 1 0 4 15 0 1 1 15 0
72 1 0 1 21 1 0 1 0 0 0 4 6 1 13 1 23 0
73 1 0 1 16 1 0 1 0 0 0 4 2 1 9 0 9 0
74 1 1 1 3 0 0 1 0 0 0 4 6 1 14 0 14 0
75 1 0 1 5 1 0 1 0 0 0 3 8 0 8 0 2 1
76 1 0 1 32 0 1 1 1 0 1 4 18 1 51 0 18 1
77 1 0 1 38 0 1 1 1 0 0 4 12 1 22 0 22 0
78 1 0 1 16 1 0 1 0 0 0 4 7 1 16 0 16 0
79 1 1 1 9 0 1 0 1 0 0 4 6 1 2 1 2 1
80 1 0 1 17 0 1 1 0 0 0 2 10 1 10 1 22 0
81 1 0 1 22 1 0 1 0 0 0 4 12 1 20 0 5 1
82 1 0 1 10 0 0 1 0 0 0 4 5 1 5 1 14 0
83 1 0 1 12 1 0 1 0 0 0 4 12 0 12 0 12 0
84 1 0 1 80 1 1 1 1 1 1 4 6 1 4 1 41 0
85 1 1 1 15 0 0 1 1 0 0 4 9 1 9 1 21 0
86 1 0 1 50 1 0 1 0 0 1 4 18 1 7 1 56 0
87 1 0 1 50 1 1 1 1 1 1 4 7 1 42 1 67 0
88 1 0 1 15 1 0 1 0 0 0 3 11 0 11 0 11 0
89 1 0 1 8 1 0 1 0 0 0 4 9 1 17 0 17 0
90 1 1 1 45 1 1 1 1 0 0 1 11 1 11 1 18 1
91 1 0 1 20 0 1 1 1 0 1 4 6 1 6 1 14 1
92 1 0 1 5 0 0 1 0 1 0 3 4 1 8 0 5 1
93 1 0 1 25 0 0 1 0 0 0 2 5 1 10 0 5 1
94 1 0 1 40 0 1 1 1 0 0 4 11 1 8 1 31 0
95 1 0 1 4 0 0 1 0 1 0 3 9 1 7 1 23 0
96 1 0 1 25 0 0 1 1 0 1 4 4 1 14 1 46 0
97 1 1 1 20 0 0 1 0 1 0 4 5 1 1 1 38 0
98 1 1 1 26 0 0 1 0 0 1 4 8 1 3 1 35 0
99 1 0 1 10 0 1 1 1 0 0 4 13 1 21 0 21 0
100 1 1 1 85 1 1 1 1 0 1 4 11 0 3 1 11 0
101 1 0 1 75 1 0 1 1 1 0 4 29 1 49 0 16 1
102 1 0 0 5 0 0 1 0 1 0 1 13 0 13 0 13 0
103 1 0 1 20 1 0 1 0 0 0 4 1 1 12 0 12 0
104 1 1 1 8 0 1 0 1 1 0 4 6 1 6 1 13 0
105 1 1 1 10 0 0 1 0 0 1 4 6 1 23 0 23 0
106 1 0 1 10 0 0 0 0 1 1 4 3 1 31 0 31 0
107 1 1 0 2 0 0 1 0 0 0 1 2 1 2 1 10 0
108 1 0 0 5 0 0 0 0 1 0 2 4 1 4 1 17 0
109 1 0 1 10 1 0 0 0 1 0 4 5 1 18 0 18 0
110 1 0 1 18 0 0 1 1 1 0 4 6 1 5 1 33 0
111 1 0 1 20 1 0 1 1 0 0 4 9 1 8 1 17 0
112 1 0 1 80 1 1 1 1 1 1 4 4 1 11 1 13 0
113 1 0 0 17 1 0 1 1 1 1 4 5 1 4 1 35 0
114 1 0 0 35 1 0 1 0 0 0 4 7 1 7 1 71 0
115 1 0 1 50 1 0 1 0 1 1 4 11 0 11 0 3 1
116 1 0 0 20 0 0 1 0 0 0 4 6 1 31 1 42 1
117 1 0 1 25 0 1 1 1 0 0 3 8 0 8 0 5 1
118 1 0 1 20 0 0 0 1 0 1 1 3 1 2 1 30 0
119 1 0 1 20 0 0 1 1 0 0 4 6 1 38 0 38 0
120 1 0 1 10 1 0 1 0 0 0 4 16 0 16 0 16 0
121 1 0 0 15 1 0 1 0 0 0 2 20 0 20 0 20 0
122 1 0 1 15 0 0 1 0 1 0 4 30 0 2 1 30 0
123 1 0 1 15 0 0 1 0 0 0 4 2 1 7 0 7 0
124 1 0 1 20 0 0 1 1 0 0 2 8 1 6 1 22 0
125 1 0 1 13 1 0 1 0 0 0 4 13 0 4 1 5 1
126 1 0 1 25 1 0 1 0 0 1 4 13 1 1 1 31 0
127 1 0 1 25 0 0 1 1 0 1 4 17 0 17 0 10 1
128 1 0 1 8 1 0 1 0 0 0 4 14 0 14 0 14 0
129 1 1 1 30 1 0 1 0 0 1 4 13 0 5 1 13 0
130 1 0 1 40 0 1 1 1 1 0 4 24 0 7 1 17 1
131 1 1 1 12 0 1 1 1 1 0 1 14 1 21 0 21 0
132 1 0 1 15 0 0 1 0 0 0 4 8 1 19 1 25 0
133 1 0 1 25 1 0 1 0 0 0 4 23 0 23 0 8 1
134 1 0 1 15 0 0 1 0 0 0 4 17 1 17 0 11 1
135 1 0 0 20 0 0 1 1 1 0 4 19 1 31 0 31 0
136 1 0 1 22 0 1 1 0 0 0 4 14 1 20 0 20 0
137 1 0 1 15 1 0 1 0 1 0 4 15 1 22 0 22 0
138 1 0 1 7 1 0 1 0 0 0 3 13 0 3 1 13 0
139 1 0 1 30 0 1 1 1 1 0 2 49 0 49 0 4 1
140 1 0 1 20 1 0 1 0 0 1 4 14 0 10 1 14 0
141 1 1 1 35 1 0 1 0 0 1 4 6 1 5 1 49 0
142 1 0 0 10 0 0 1 0 0 0 4 12 0 12 0 12 0
143 1 0 1 8 0 0 1 0 1 0 3 14 0 1 1 14 0
144 1 0 1 13 0 0 0 0 1 0 4 32 1 38 0 38 0
145 1 1 0 10 0 1 1 1 0 0 2 12 1 13 1 41 0
146 1 0 1 8 0 0 0 1 1 0 4 10 1 18 0 18 0
147 1 0 1 7 1 0 1 0 0 0 4 8 0 8 0 8 0
148 1 0 1 52 1 0 1 1 1 1 4 15 1 39 1 76 0
149 1 1 1 14 0 1 1 1 1 0 4 8 1 62 0 62 0
150 1 1 1 7 0 0 1 0 0 0 1 5 1 17 0 17 0
151 1 1 1 20 1 0 1 0 0 0 4 7 1 6 1 17 1
152 1 0 1 15 0 0 0 1 1 1 4 19 1 3 1 42 0
153 1 0 1 10 0 0 1 0 0 0 4 10 0 10 0 2 1
154 1 0 1 35 1 1 1 0 0 0 4 10 1 27 0 27 0
I have used the Import Dataset tool within R, but I cannot seem to find the right settings to import the dataset: the columns are either merged together, or there are additional columns with many NAs.
I have looked around for similar questions, but I cannot find a solution that suits my problem.
How can I import this dataset?
Ensure it is saved as a plain-text file (for example text.txt), then read it with read.table("text.txt"); read.table's default separator handles whitespace-delimited columns like these.
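A slightly fuller sketch: since the file has no header row, any column names are placeholders you assign yourself.
surv <- read.table("text.txt", header = FALSE)
names(surv)[1] <- "id"  # the first column appears to be a row identifier
str(surv)               # verify that all 18 columns parsed as integers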
I am working with a large dataset and need to add rows to it. This issue was raised in another question, but I am separating it out from the other questions asked there. I'm fairly new to SO, so please let me know if this is not a 'done' thing.
Data example:
yr week id days rev p1 p2 p3 f1 f2 f3 f4
2016 3 1 1 5568.3 0 1 0 0 0 0 0
2016 4 1 3 8869.53 0 1 0 0 0 0 0
2016 5 1 2 12025.8 0 1 0 0 0 0 0
2016 6 1 2 9126.6 0 1 0 0 0 0 0
2016 7 1 3 4415.4 0 1 0 0 0 0 0
2016 8 1 2 11586.6 0 1 0 0 0 0 0
2016 10 1 1 2144.4 0 1 0 0 0 0 0
2016 11 1 1 2183.25 0 1 0 0 0 0 0
2016 14 1 2 4998 0 1 0 0 0 0 0
2016 15 1 3 117 0 1 0 0 0 0 0
2016 1 2 4 12743.3 0 0 1 1 1 0 0
2016 2 2 2 7473.48 0 0 1 1 1 0 0
2016 5 2 2 8885.52 0 0 1 1 1 0 0
2016 7 2 1 15330.6 0 0 1 1 1 0 0
2016 8 2 2 3763.8 0 0 1 1 1 0 0
2016 9 2 1 2274.05 0 0 1 1 1 0 0
For each combination of id and yr there are several rows of data, one per week. The p1:p3 and f1:f4 columns are invariant within each id/yr combination, while rev varies with week.
For each yr/id combination there is a maximum value of week. What I want to do is add the missing rows, from week = 1 up to that maximum.
I would like to end up with:
yr week id days rev p1 p2 p3 f1 f2 f3 f4
2016 1 1 0 NA 0 1 0 0 0 0 0
2016 2 1 0 NA 0 1 0 0 0 0 0
2016 3 1 1 5568.3 0 1 0 0 0 0 0
2016 4 1 3 8869.53 0 1 0 0 0 0 0
2016 5 1 2 12025.8 0 1 0 0 0 0 0
2016 6 1 2 9126.6 0 1 0 0 0 0 0
2016 7 1 3 4415.4 0 1 0 0 0 0 0
2016 8 1 2 11586.6 0 1 0 0 0 0 0
2016 9 1 0 NA 0 1 0 0 0 0 0
2016 10 1 1 2144.4 0 1 0 0 0 0 0
2016 11 1 1 2183.25 0 1 0 0 0 0 0
2016 12 1 0 NA 0 1 0 0 0 0 0
2016 13 1 0 NA 0 1 0 0 0 0 0
2016 14 1 2 4998 0 1 0 0 0 0 0
2016 15 1 3 117 0 1 0 0 0 0 0
2016 1 2 4 12743.3 0 0 1 1 1 0 0
2016 2 2 2 7473.48 0 0 1 1 1 0 0
2016 3 2 0 NA 0 0 1 1 1 0 0
2016 4 2 0 NA 0 0 1 1 1 0 0
2016 5 2 2 8885.52 0 0 1 1 1 0 0
2016 6 2 0 NA 0 0 1 1 1 0 0
2016 7 2 1 15330.6 0 0 1 1 1 0 0
2016 8 2 2 3763.8 0 0 1 1 1 0 0
2016 9 2 1 2274.05 0 0 1 1 1 0 0
I have tried using CJ from the data.table package, but the issue is that the join is different for each id/yr group. Any suggestions are appreciated.
Grouping with dplyr (v0.4.3) and the complete function from tidyr (v0.4.1) should do the trick:
library(dplyr)
library(tidyr)
df %>%
  group_by(yr, id) %>%
  complete(week = 1:max(week)) %>%
  replace_na(list(days = 0)) %>%
  group_by(yr, id) %>%
  mutate_each(funs(replace(., is.na(.), mean(., na.rm = TRUE))), p1:f4)
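For reference, newer tidyr lets complete() take a fill argument, and mutate_each()/funs() have since been superseded by across(); a hedged modern equivalent:
df %>%
  group_by(yr, id) %>%
  complete(week = 1:max(week), fill = list(days = 0)) %>%
  mutate(across(p1:f4, ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE)))) %>%
  ungroup()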
I have a problem with filling missing values in a data frame. The first 3 columns hold the product ID, the store ID, and the week number. There are also 28 columns, from 4 to 31, corresponding to the last 28 days of sales for the item (the last 7 of those days form the current week). I want to fill the missing values by comparing two records that have the same first and second columns but different week numbers.
corrections <- function(x, y) {
  # the function changes vector y if the difference between weeks is not greater than 3
  if (x[1] == y[1] && x[2] == y[2] && -(x[3] - y[3]) <= 3) {
    t <- y[3] - x[3]
    t <- as.integer(t)
    a <- x[(4 + t * 7):31]
    b <- y[4:(31 - t * 7)]
    c <- a - b  # NA wherever either overlapping day is NA
    for (i in 1:(28 - t * 7)) {
      if (is.na(c[i])) {
        if (!(is.na(a[i]) && is.na(b[i]))) {
          if (is.na(b[i]))
            b[i] <- a[i]
          else
            a[i] <- b[i]
        }
      }
    }
    y[4:(31 - t * 7)] <- b
  }
  return(y)
}

for (i in 2:nrow(salesTraining)) {
  salesTraining[i, ] <- corrections(salesTraining[i - 1, ], salesTraining[i, ])
}
The loop takes 1 minute for every 1,000 records, so with my 212,000 records it will take ~3.5 hours (assuming linear complexity). Is there an error, or can I do this better/faster?
Example of data frame:
productID storeID weekInData dailySales1 dailySales2 dailySales3 dailySales4 dailySales5
1 1 1 37 0 0 0 0 0
2 1 1 38 0 0 0 0 0
3 1 1 39 0 0 0 0 0
4 1 1 40 0 NA 0 NA 2
5 1 1 41 NA 0 NA 0 0
6 1 1 42 0 0 0 NA 0
7 1 1 43 0 0 NA 0 NA
8 1 1 44 0 2 1 NA 0
9 1 1 45 NA 0 0 NA 0
10 1 1 46 NA 0 0 NA NA
dailySales6 dailySales7 dailySales8 dailySales9 dailySales10 dailySales11 dailySales12 dailySales13
1 NA NA 0 NA 0 0 0 0
2 0 NA NA 0 0 0 0 0
3 0 NA 0 0 0 NA 2 NA
4 0 NA 0 NA 0 NA 0 0
5 0 0 NA 0 0 0 0 0
6 NA 0 NA 0 0 0 0 0
7 0 0 0 2 NA 0 0 0
8 0 NA 0 NA 0 NA 0 1
9 1 0 0 0 0 0 1 0
10 0 0 0 NA 0 NA 0 0
dailySales14 dailySales15 dailySales16 dailySales17 dailySales18 dailySales19 dailySales20
1 0 0 0 0 0 0 0
2 0 0 0 0 5 2 NA
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0
6 0 0 2 1 0 0 NA
7 0 0 0 0 0 0 1
8 0 0 0 0 0 1 0
9 0 0 -1 0 0 0 0
10 0 0 0 0 0 0 0
dailySales21 dailySales22 dailySales23 dailySales24 dailySales25 dailySales26 dailySales27
1 NA 0 0 0 5 2 0
2 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0
5 0 0 NA 1 0 0 0
6 0 0 0 0 0 0 1
7 0 0 0 0 0 1 0
8 0 0 NA 0 0 0 0
9 NA 0 0 0 NA 0 0
10 0 1 0 0 0 0 0
dailySales28 daysStoreClosed_series daysStoreClosed_target dayOfMonth dayOfYear weekOfYear month
1 0 5 2 23 356 51 12
2 0 6 2 30 363 52 12
3 0 6 1 6 5 1 1
4 0 6 1 13 12 2 1
5 0 6 1 19 18 3 1
6 0 5 1 26 25 4 1
7 0 4 1 2 32 5 2
8 0 4 1 9 39 6 2
9 0 4 1 16 46 7 2
10 0 4 1 23 53 8 2
quarter
1 4
2 4
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
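Much of the cost likely comes from assigning whole data-frame rows inside the loop, which copies heavily. A hedged sketch of the same fill logic over a plain numeric matrix, assuming columns 4:31 are all numeric and the records are sorted by week within each product/store:
m    <- as.matrix(salesTraining[, 4:31])  # daily-sales block only
keys <- salesTraining[, 1:3]              # product ID, store ID, week
for (i in 2:nrow(m)) {
  d <- keys[i, 3] - keys[i - 1, 3]        # week gap to the previous record
  if (keys[i - 1, 1] == keys[i, 1] && keys[i - 1, 2] == keys[i, 2] && d <= 3) {
    overlap <- 28 - d * 7                 # days the two 28-day windows share
    a <- m[i - 1, (d * 7 + 1):28]         # previous record, shifted into alignment
    b <- m[i, 1:overlap]                  # current record
    fill <- is.na(b) & !is.na(a)          # fill only NAs with a known counterpart
    m[i, which(fill)] <- a[fill]
  }
}
salesTraining[, 4:31] <- m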