I try to split dataframe by 50% by class. However, I do not want to split fields with the same OID (object identifier). I would like the fields with the same OID to be in the same set.
#Data frame:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
105 125 110 "tree" 2
112 132 117 "tree" 2
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
102 128 111 "meadow" 8
108 129 115 "meadow" 8
113 134 119 "meadow" 8
Expected data:
#Expected:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
This selects the top half of rows in each group, plus any rows which have the same OID as the rows in that top half.
library(dplyr)
df %>%
group_by(CLASS) %>%
filter(OID %in% head(OID, n() %/% 2)) %>%
ungroup
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
If your real data is arranged by OID like this example, you could also use top_frac
df %>%
group_by(CLASS) %>%
top_frac(.5, -OID)
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
Your data:
df = structure(list(b1 = c(110L, 112L, 105L, 112L, 109L, 93L, 86L,
105L, 102L, 108L, 113L), b2 = c(134L, 133L, 125L, 132L, 125L,
110L, 106L, 136L, 128L, 129L, 134L), b3 = c(119L, 118L, 110L,
117L, 115L, 101L, 95L, 116L, 111L, 115L, 119L), CLASS = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("meadow",
"tree"), class = "factor"), OID = c(1L, 1L, 2L, 2L, 6L, 6L, 7L,
7L, 8L, 8L, 8L)), class = "data.frame", row.names = c(NA, -11L
))
First create a function to take 1/2 according to OID
func = function(x){
x[x$OID %in% x$OID[1:round(nrow(x)/2)],]
}
We randomize the way the OID are sorted
df$OID = factor(df$OID,levels=sample(unique(df$OID)))
df = df[order(df$OID),]
do.call(rbind,by(df,df$CLASS,func))
This will ensure you get random ~ 50% everytime, with complete OID
Related
I was wondering how I could go about trying to take outliers from Boxplot$out (returns the outliers within the data) and put them into a table which shows the class they belong to e.g. if outlier is from class "Van", "Bus, "Saab" etc..
I have tried using which() function but this returns only the index of the outlier and not the class. I am not sure how to go about putting this into a table.
Any help would be greatly appreciated!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
boxplot(vehData)
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
removeOutliers2 <- removeOutliers(removeOutliers1)
This can be simplified. Starting with your data frame vehData. First get the rownumbers of the outliers. In my comment I accidentally left out the seq() function:
vehDataRemove <- vehData[, -c(1, 20)]
OutVals <- boxplot(vehDataRemove)
idx <- sapply(seq(length(OutVals$out)), function(x) which(vehDataRemove[, OutVals$group[x]] == OutVals$out[x]))
idx
# [1] 5 5 6 5 3
Notice that three outliers are in the 5th row. Now remove the rows with outliers:
NoOuts <- vehDataRemove[-unique(idx), ]
NoOuts
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
So you have lost half of your data! Alternatively set the outliers to missing values:
Outs2NA <- vehDataRemove
Outs2NA[cbind(idx, OutVals$group)] <- NA
Outs2NA
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 3 104 50 106 209 66 10 207 32 23 158 223 635 220 73 NA 9 188 196
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
# 5 85 44 70 205 NA NA 149 45 19 144 241 325 188 NA 9 11 180 183
# 6 107 57 106 172 50 NA 255 26 28 169 280 957 264 85 5 9 181 183
structure(list(Date = c("KW 52 / 2016", "KW 1 / 2017", "KW 2 / 2017",
"KW 3 / 2017"), Sales_AT = c(150L, 169L, 143L, 170L), Sales_CH = c(150L,
169L, 143L, 170L), Sales_GER = c(150L, 169L, 143L, 170L), Sales_HUN = c(134L,
139L, NA, 125L), Sales_JP = c(134L, NA, 142L, 125L), Sales_POL = c(127L,
175L, 150L, 141L), Sales_SWE = c(125L, NA, 159L, 131L), Sales_USA = c(169L,
159L, NA, 132L), difference_AT = c(NA, 19L, -26L, 27L), difference_CH = c(NA,
19L, -26L, 27L), difference_GER = c(NA, 19L, -26L, 27L), difference_HUN = c(NA,
5L, NA, -14L), difference_JP = c(NA, NA, 8L, -17L), difference_POL = c(NA,
48L, -25L, -9L), difference_SWE = c(NA, NA, 34L, -28L), difference_USA = c(NA,
-10L, NA, -27L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-4L))
This is my dataset which looks like this:
A tibble: 4 x 17
Date Sales_AT Sales_CH Sales_GER Sales_HUN Sales_JP Sales_POL Sales_SWE Sales_USA difference_AT difference_CH difference_GER difference_HUN difference_JP difference_POL difference_SWE difference_USA
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 150 134 134 127 125 169 NA NA NA NA NA NA NA NA
2 KW 1 / 2017 169 169 169 139 NA 175 NA 159 19 19 19 5 NA 48 NA -10
3 KW 2 / 2017 143 143 143 NA 142 150 159 NA -26 -26 -26 NA 8 -25 34 NA
4 KW 3 / 2017 170 170 170 125 125 141 131 132 27 27 27 -14 -17 -9 -28 -27
I want to reorder the dataset to have the sales and difference column of each country next to each other.
I´m look for a dplyr solution which works like this, but in a dynamic way:
wide_result %>%
select(contains("AT"), contains("CH"), contains("HUN"), contains("JP"), contains("USA"))
Can anyone help me?
Using base R:
df[c(1, order(sub(".*_", "", names(df)[-1])) + 1)]
Here's a way we can do it. Basically, we put the names of the data into a tibble, extract the part of the name after the _ (when possible), and then sort by that extracted text.
names_sort <- tibble(nn = names(dat)) %>%
filter(nn != "Date") %>% # remove Date column, since we'll select that first
# replace everything before and up to _ with ""
mutate(names_fix = gsub(".*_", "", nn)) %>%
arrange(names_fix) %>%
pull(nn)
dat %>%
select(Date, names_sort)
# Date Sales_AT difference_AT Sales_CH difference_CH
# <chr> <int> <int> <int> <int>
# 1 KW 52 / 2016 150 NA 150 NA
# 2 KW 1 / 2017 169 19 169 19
# 3 KW 2 / 2017 143 -26 143 -26
# 4 KW 3 / 2017 170 27 170 27
You can use dplyr select_at:
vars <- c("CH", "AT")
df %>%
select_at(vars(one_of("Date",
paste0("Sales_", vars),
paste0("difference_", vars))))
# A tibble: 4 x 5
Date Sales_CH Sales_AT difference_CH difference_AT
<chr> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 NA NA
2 KW 1 / 2017 169 169 19 19
3 KW 2 / 2017 143 143 -26 -26
4 KW 3 / 2017 170 170 27 27
I have a data frame with 3 ID variables, then several values for each ID.
user Log Pass Value
2 2 123 342
2 2 123 543
2 2 123 231
2 2 124 257
2 2 124 342
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 342
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
The start and end of each set of values is sometimes noisy, and I want to be able to delete the first few values. Unfortunately the number of values varies significantly, but it is always the first and last 20% of values that are noisy.
I want to delete the first 20% of rows, with a minimum of 1 row deleted.
So for instance if there are 20 values for user 2 log 2 pass 123 I want to delete the first and last 4 rows. If there are only 3 values for the ID variable I want to delete the first and last row.
The resulting dataset would be:
user Log Pass Value
2 2 123 543
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
4 3 125 257
4 3 125 543
4 3 125 231
I've tried fiddling around with nrow but I struggle to figure out how to reference the % of rows by id variable.
Thanks.
Jonathan.
I believe the following can do it.
DATA.
dat <-
structure(list(user = c(2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), Log = c(2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), Pass = c(123L, 123L, 123L, 124L, 124L, 125L, 125L,
125L, 125L, 125L, 125L, 125L, 125L, 125L, 125L, 125L, 125L, 125L,
125L, 125L, 125L), Value = c(342L, 543L, 231L, 257L, 342L, 543L,
231L, 257L, 342L, 543L, 231L, 257L, 543L, 231L, 257L, 543L, 231L,
257L, 543L, 231L, 257L)), .Names = c("user", "Log", "Pass", "Value"
), class = "data.frame", row.names = c(NA, -21L))
CODE.
fun <- function(x, p = 0.20){
n <- nrow(x)
m <- max(1, round(n*p))
inx <- c(seq_len(m), n - seq_len(m) + 1)
x[-inx, ]
}
result <- do.call(rbind, lapply(split(dat, dat$user), fun))
row.names(result) <- NULL
result
# user Log Pass Value
#1 2 2 123 543
#2 2 2 123 231
#3 2 2 124 257
#4 4 3 125 342
#5 4 3 125 543
#6 4 3 125 231
#7 4 3 125 257
#8 4 3 125 543
#9 4 3 125 231
#10 4 3 125 257
#11 4 3 125 543
#12 4 3 125 231
#13 4 3 125 257
Would something like this help?
For a dataframe df:
df[-c(1:floor(nrow(df)*0.2), (1+ceiling(nrow(df)*0.8)):nrow(df)),]
Just removing the first and last 20%, taking the upper and lower values so that for smaller data frame you keep some of the information:
> df<-data.frame(a=1:100)
> df[-c(1:floor(nrow(df)*0.2),(1+ceiling(nrow(df)*0.8)):nrow(df)),]
[1] 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
[31] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
> df<-data.frame(1:3)
> df[-c(1:floor(nrow(df)*0.2),(1+ceiling(nrow(df)*0.8)):nrow(df)),]
[1] 2
You can do this with dplyr...
library(dplyr)
df2 <- df %>% group_by(user, Log, Pass) %>%
filter(n()>2) %>% #remove those with just two elements or fewer
slice(max(2, 1+ceiling(n()*0.2)):min(n()-1, floor(0.8*n())))
df2
user Log Pass Value
1 2 2 123 543
2 4 3 125 543
3 4 3 125 231
4 4 3 125 257
5 4 3 125 543
6 4 3 125 231
7 4 3 125 257
8 4 3 125 543
9 4 3 125 231
Calculate the offset for what you want to retain:
rem <- ceiling( nrow( x ) * .2 ) + 1
Then take out the records you don-t want:
dat <- dat[ rem : ( nrow( dat ) - rem ), ]
Here is an idea using base R that returns the row indices of each user to keep and then subsets on these indices.
idx <- unlist(lapply(split(seq_along(dat[["user"]]), dat[["user"]]), function(x) {
tmp <- max(1, ceiling(.2 * length(x)))
tail(head(x, -tmp), -tmp)}),
use.names=FALSE)
split(seq_along(dat[["user"]]), dat[["user"]]) returns a list of the rows for each user. lapply loops through these rows, calculating the number of rows to drop from each end with split(seq_along(dat[["user"]]), dat[["user"]]), and then dropping them with tail(head(x, -tmp), -tmp)}). Since lapply returns a named list, this is unlisted and the names are dropped.
This returns
idx
2 3 4 10 11 12 13 14 15 16 17
Now subset
dat[idx,]
user Log Pass Value
2 2 2 123 543
3 2 2 123 231
4 2 2 124 257
10 4 3 125 543
11 4 3 125 231
12 4 3 125 257
13 4 3 125 543
14 4 3 125 231
15 4 3 125 257
16 4 3 125 543
17 4 3 125 231
This question already has answers here:
Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
(8 answers)
Closed 5 years ago.
I'd really appreciate some help getting this messy set of new survey data into a usable form. It was collected in a strange way and now I've got strange data to work with. I've looked through tidyr and used those approaches to no end. I suspect my problem is that I'm thinking about this dataset all wrong and I'm blind to some real answer. But given all the things I need to do to this df, I cant figure out where to start and thus where to start googling.
What I need:
For each person to be their own row
Each person retains their GroupID and Treated value
For the variables currently attached to each person individually to become columns (age, weight, height)
Fake (and much smaller):
structure(list(GroupID = 1:5, Treated = c("Y", "Y", "N", "Y",
"N"), person1_age = c(45L, 33L, 71L, 19L, 52L), person1_weight = c(187L,
145L, 136L, 201L, 168L), person1_height = c(69L, 64L, 51L, 70L,
66L), person2_age = c(54L, 20L, 48L, 63L, 26L), person2_weight = c(140L,
122L, 186L, 160L, 232L), person2_height = c(62L, 70L, 65L, 72L,
74L), person3_age = c(21L, 56L, 40L, 59L, 67L), person3_weight = c(112L,
143L, 187L, 194L, 159L), person3_height = c(61L, 69L, 73L, 63L,
72L)), .Names = c("GroupID", "Treated", "person1_age", "person1_weight",
"person1_height", "person2_age", "person2_weight", "person2_height",
"person3_age", "person3_weight", "person3_height"), row.names = c(NA,
5L), class = "data.frame")
Any help or further readings you could point me to would be very much appreciated.
reshape can do this, with the appropriate arguments:
> reshape(x, direction="long", varying=names(x)[3:11], timevar='person', v.names=c('height', 'age', 'weight'), sep='_')
GroupID Treated person height age weight id
1.1 1 Y 1 187 45 69 1
2.1 2 Y 1 145 33 64 2
3.1 3 N 1 136 71 51 3
4.1 4 Y 1 201 19 70 4
5.1 5 N 1 168 52 66 5
1.2 1 Y 2 140 54 62 1
2.2 2 Y 2 122 20 70 2
3.2 3 N 2 186 48 65 3
4.2 4 Y 2 160 63 72 4
5.2 5 N 2 232 26 74 5
1.3 1 Y 3 112 21 61 1
2.3 2 Y 3 143 56 69 2
3.3 3 N 3 187 40 73 3
4.3 4 Y 3 194 59 63 4
5.3 5 N 3 159 67 72 5
This relies on the order of the columns in your original data, for the varying argument, being in increasing order in the original data.
If that's not the case, specify varying manually. Here's what is used above:
> names(x)[3:11]
[1] "person1_age" "person1_weight" "person1_height" "person2_age" "person2_weight" "person2_height"
[7] "person3_age" "person3_weight" "person3_height"
We can also use melt from data.table which can take multiple patterns in the measure argument
library(data.table)
melt(setDT(x), measure = patterns("age$", "weight$", "height$"),
variable.name = "person", value.name = c("age", "weight", "height"))
# GroupID Treated person age weight height
# 1: 1 Y 1 45 187 69
# 2: 2 Y 1 33 145 64
# 3: 3 N 1 71 136 51
# 4: 4 Y 1 19 201 70
# 5: 5 N 1 52 168 66
# 6: 1 Y 2 54 140 62
# 7: 2 Y 2 20 122 70
# 8: 3 N 2 48 186 65
# 9: 4 Y 2 63 160 72
#10: 5 N 2 26 232 74
#11: 1 Y 3 21 112 61
#12: 2 Y 3 56 143 69
#13: 3 N 3 40 187 73
#14: 4 Y 3 59 194 63
#15: 5 N 3 67 159 72
I have a data table that repeats records. I would like to transpose the table but into the unique record names.
Below is a sample of the Data table:
V1 V2 id
ClientID 29 1
CheckID 201 1
PaymentAmount 256 1
Gross 301 1
Net 256 1
Invested 130 1
Invested 53 1
Invested 118 1
ClientID 31 2
CheckID 222 2
PaymentAmount 41 2
Gross 46 2
Net 41 2
Invested 46 2
ClientID 43 3
CheckID 310 3
PaymentAmount 41 3
Gross 46 3
Net 41 3
Invested 46 3
You can see from the table above that the record in X1 called "Investment" can occur more than once for a single ClientID. I'd like to transpose the data so that it looks as such:
ClientID CheckID PaymentAmount Gross Net Invested ID
29 201 256 301 256 130 1
29 201 256 301 256 53 1
29 201 256 301 256 118 1
31 222 41 46 41 46 2
43 310 41 46 41 46 3
43 310 41 46 41 48 3
any support is greatly appreciated!
We can create a sequence column grouped by the "V1", "id" column using data.table, then convert from 'long' to 'wide' format with dcast and replace the NA with the non-NA preceding values using na.locf from zoo.
library(data.table)
library(zoo)
setDT(df1)[, N:= 1:.N , by = .(V1, id)]
dcast(df1, id+N~V1, value.var="V2")[, lapply(.SD, na.locf),
by = id, .SDcols = CheckID:PaymentAmount]
# id CheckID ClientID Gross Invested Net PaymentAmount
#1: 1 201 29 301 130 256 256
#2: 1 201 29 301 53 256 256
#3: 1 201 29 301 118 256 256
#4: 2 222 31 46 46 41 41
#5: 3 310 43 46 46 41 41
data
df1 <- structure(list(V1 = c("ClientID", "CheckID", "PaymentAmount",
"Gross", "Net", "Invested", "Invested", "Invested", "ClientID",
"CheckID", "PaymentAmount", "Gross", "Net", "Invested", "ClientID",
"CheckID", "PaymentAmount", "Gross", "Net", "Invested"), V2 = c(29L,
201L, 256L, 301L, 256L, 130L, 53L, 118L, 31L, 222L, 41L, 46L,
41L, 46L, 43L, 310L, 41L, 46L, 41L, 46L), id = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L)), .Names = c("V1", "V2", "id"), class = "data.frame",
row.names = c(NA, -20L))