Sorting data in R and how to extract values? - r

This is my dataset. I am new to R and I am trying to write a script for this data set.
R> head(KenTau)
Age CapReg TrSw FeelChk CanSw
1 20 1 0 0 0
2 36 1 0 0 0
3 35 1 3 2 2
4 21 0 0 2 2
5 43 0 0 2 2
6 34 1 0 0 0
I want to compare TrSw with the rest of the column variables, i.e.
TrSw Vs Age
TrSw Vs CapReg
TrSw Vs FeelChk
TrSw Vs CanSw
To run this in R, I use this command:
# Two-sided Kendall rank correlation test between Age and TrSw
cor.test(KenTau$Age, KenTau$TrSw, method = "kendall", alternative = "two.sided")
I also want to extract Age and the p-value so I can have a list, as I have close to 50 variables.
dput() of data:
KenTau <- structure(list(Age = c(20L, 36L, 35L, 21L, 43L, 34L, 37L, 62L,
54L, 47L, 48L, 45L, 2L, 2L, 2L, 54L, 52L, 40L, 58L, 29L, 27L,
28L, 46L, 35L, 50L, 31L, 48L, 2L, 29L, 54L, 52L, 28L, 28L, 26L,
38L, 59L, 51L, 58L, 39L, 44L, 53L, 2L, 39L, 55L, 48L, 2L, 23L,
51L, 50L, 26L, 28L, 40L, 38L, 61L, 52L, 33L, 2L, 59L, 27L, 45L,
45L, 57L, 66L, 52L, 58L, 34L, 28L, 39L, 48L, 53L, 39L, 46L, 57L,
36L, 25L, 22L, 29L, 46L, 25L, 25L, 35L, 44L, 24L, 26L, 33L, 27L,
41L, 28L, 26L, 32L, 36L, 35L, 32L, 33L, 29L, 29L, 52L, 55L, 23L,
29L, 45L, 26L, 48L, 54L, 50L, 35L, 27L, 39L, 41L, 30L, 30L, 31L,
27L, 28L, 27L, 25L, 34L, 23L, 30L, 34L, 52L, 20L, 31L, 2L, 45L,
34L, 21L, 60L, 34L, 40L, 47L, 30L, 54L, 36L, 32L, 31L, 55L, 57L,
23L, 31L, 26L, 26L, 27L, 19L, 26L, 25L, 37L, 47L, 38L, 38L, 26L,
25L, 41L), CapReg = c(1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
TrSw = c(0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 3L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L), FeelChk = c(0L, 0L, 2L, 2L, 2L,
0L, 2L, 2L, 2L, 3L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 2L, 0L, 1L,
0L, 1L, 2L, 2L, 1L, 1L, 0L, 2L, 2L, 1L, 2L, 2L, 0L, 1L, 2L,
0L, 1L, 2L, 2L, 3L, 0L, 2L, 1L, 0L, 0L, 2L, 1L, 2L, 2L, 1L,
1L, 0L, 1L, 2L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 3L, 1L, 2L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 2L, 3L, 1L, 2L, 2L, 1L, 1L, 0L,
2L, 1L, 0L, 1L, 1L, 0L, 2L, 1L, 1L, 0L, 0L, 0L, 2L, 1L, 2L,
1L, 0L, 0L, 0L, 0L, 2L, 0L, 1L, 0L, 2L, 2L, 2L, 0L, 0L, 2L,
3L, 2L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 0L, 0L, 1L, 2L, 2L,
1L, 1L, 2L, 0L, 3L, 1L, 0L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 0L,
0L, 2L, 0L, 2L, 2L, 3L, 0L, 1L, 1L, 2L, 0L, 0L, 0L), CanSw = c(0L,
0L, 2L, 2L, 2L, 0L, 2L, 2L, 2L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 2L, 2L, 0L, 0L, 0L, 2L, 2L, 0L, 0L, 2L, 2L, 2L, 3L, 2L,
2L, 0L, 0L, 2L, 0L, 0L, 2L, 2L, 1L, 1L, 2L, 0L, 0L, 2L, 2L,
3L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 0L, 1L, 0L, 2L, 1L, 3L, 1L,
0L, 0L, 2L, 0L, 0L, 0L, 2L, 0L, 1L, 1L, 1L, 2L, 0L, 1L, 2L,
2L, 1L, 1L, 0L, 2L, 0L, 0L, 1L, 0L, 0L, 2L, 1L, 0L, 0L, 0L,
0L, 2L, 1L, 2L, 0L, 2L, 2L, 0L, 1L, 2L, 0L, 1L, 0L, 2L, 2L,
2L, 0L, 0L, 2L, 3L, 2L, 0L, 0L, 2L, 2L, 2L, 2L, 2L, 2L, 0L,
0L, 0L, 2L, 2L, 1L, 1L, 2L, 1L, 0L, 0L, 2L, 0L, 1L, 2L, 2L,
1L, 1L, 0L, 0L, 2L, 2L, 0L, 2L, 2L, 3L, 1L, 1L, 0L, 2L, 0L,
2L, 0L)), .Names = c("Age", "CapReg", "TrSw", "FeelChk",
"CanSw"), class = "data.frame", row.names = c(NA, -153L))

Whilst I'm not convinced of the statistical merits of generating the p-values for 50 correlations, this is quite easy to do with lapply() and friends.
For this I chose to iterate over the indices of names of KenTau that are not "TrSw" as that is the variable you wish to compare all others with. I first grab those indices using which():
R> inds <- which(names(KenTau) != "TrSw")
R> inds
[1] 1 2 4 5
Next I set up a call to lapply(), where I will iterate over inds. I now need an anonymous function that takes an index ind as the first argument (this is what lapply() will pass my function at each iteration), and I need to pass in the data, which I do so as argument x. My anonymous function calls cor.test() as you show in your example, but notice how x[, ind] is used to refer to the current index or column we are correlating with TrSw. The last part of the lapply() call says to pass as x, the data KenTau so that whenever you see x in the anonymous function this really refers to a copy of KenTau:
# Run a two-sided Kendall test of TrSw against every column indexed by inds.
# KenTau is handed to the anonymous function through its x argument.
cors <- lapply(
  inds,
  function(ind, x) {
    cor.test(x[, ind], x[, "TrSw"], method = "kendall",
             alternative = "two.sided")
  },
  x = KenTau
)
Adding some names to the list that is cors will help later so do that now:
# Label each test result with the name of the column it was computed from
cors <- setNames(cors, names(KenTau)[inds])
If we look at cors we see that it is a list:
R> str(cors, max = 1)
List of 4
$ Age :List of 8
..- attr(*, "class")= chr "htest"
$ CapReg :List of 8
..- attr(*, "class")= chr "htest"
$ FeelChk:List of 8
..- attr(*, "class")= chr "htest"
$ CanSw :List of 8
..- attr(*, "class")= chr "htest"
each element of the list being an object of class "htest", which is what cor.test() returns. There are four such objects because there were four variables to compare with TrSw.
You wish to extract the p-value, so we need to see where this is stored in an "htest" object:
R> str(cors[[1]])
List of 8
$ statistic : Named num 1.57
..- attr(*, "names")= chr "z"
$ parameter : NULL
$ p.value : num 0.116
$ estimate : Named num 0.105
..- attr(*, "names")= chr "tau"
$ null.value : Named num 0
..- attr(*, "names")= chr "tau"
$ alternative: chr "two.sided"
$ method : chr "Kendall's rank correlation tau"
$ data.name : chr "x[, ind] and x[, \"TrSw\"]"
- attr(*, "class")= chr "htest"
The above output shows the p-value is stored in component p.value. To extract all 4 p-values, we want to, in effect, do this:
cors[[i]][["p.value"]]
where i indexes each element of cors in turn. For this we could use lapply() again, but sapply() will simplify the result to a vector for us, which is neater in this case. The sapply() call will pass us each cors[[i]] in turn, so we just need to apply the [[ function (yes, it may not look like one, but it very much is a function; "[["()). Besides the object itself, that function takes a single argument (in this case we can use the name of the component we want to extract), which I pass in as "p.value":
# Pull the "p.value" component out of every element of cors; sapply()
# simplifies the extracted values to a vector.
res <- sapply(X = cors, FUN = `[[`, "p.value")
Because I added names to cors, sapply() will return a named vector containing the p-values of the correlation between the named variable and TrSw:
R> res
Age CapReg FeelChk CanSw
1.157889e-01 3.920115e-01 2.189736e-04 1.578040e-06
If you want another component of the result, then replace "p.value" with the name of the component you want, e.g. "statistic" to get the test statistic (z in the output above) or "estimate" to get Kendall's tau.
If you are going to be doing this for a lot of variables, go and read about multiple testing and adjusting p-values (see p.adjust()), as I'm not convinced your results would be that useful if presented as just 50 correlations.

Related

How do I create a ggplot in R from a non-linear model using the mgcv package?

I have a non-linear survival model which I have coded using the mgcv package. I can produce a regular plot, but I would like to be able to code a ggplot2 version instead. How do I go about this?
Here is my code:
df <- structure(list(SurvYear =c(3L, 2L, 3L, 6L, 8L, 3L, 5L, 2L, 9L,
8L, 1L, 7L, 1L, 4L, 6L, 8L, 2L, 5L, 1L, 1L, 7L, 1L, 5L, 3L, 2L,
1L, 9L, 1L, 5L, 2L, 2L, 1L, 2L, 3L, 4L, 8L, 7L, 2L, 2L, 6L, 9L,
7L, 3L, 9L, 6L, 8L, 2L, 8L, 2L, 1L, 1L, 6L, 5L, 3L, 3L, 7L, 2L,
4L, 5L, 2L, 3L, 7L, 4L, 1L, 2L, 2L, 3L, 5L, 1L, 9L, 2L, 2L, 3L,
9L, 6L, 2L, 2L, 4L, 3L, 1L, 9L, 7L, 3L, 1L, 2L, 1L, 6L, 3L, 1L,
5L, 6L, 5L, 6L, 4L, 2L, 1L, 3L, 1L, 1L, 3L, 4L, 3L, 8L, 9L, 7L,
6L, 3L, 5L, 2L, 7L, 9L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 9L, 1L,
4L, 8L, 1L, 8L, 1L, 1L, 8L, 5L, 2L, 9L, 4L, 8L, 4L, 9L, 2L, 2L,
3L, 2L, 9L, 3L, 2L, 1L, 3L, 2L, 1L, 9L, 9L, 2L, 1L, 1L, 1L, 2L,
9L, 1L, 5L, 1L, 6L, 9L, 3L, 2L, 2L, 5L, 7L, 4L, 2L, 7L, 2L, 4L,
5L, 3L, 3L, 9L, 2L, 6L, 1L, 3L, 4L, 5L, 9L, 8L, 1L, 2L, 8L, 2L,
9L, 1L, 7L, 3L, 3L, 1L, 6L, 3L, 4L, 9L, 1L, 3L, 4L, 4L, 2L, 7L,
2L, 3L, 1L, 1L, 7L, 2L, 1L, 1L, 2L, 1L, 9L, 1L, 2L, 9L, 1L, 1L,
2L, 3L, 7L, 3L, 1L, 1L, 2L, 5L, 4L, 6L, 7L, 1L, 9L, 2L, 1L, 8L,
1L, 2L, 1L, 4L, 2L, 3L, 3L, 9L, 9L, 9L, 4L, 1L, 1L, 4L, 9L, 3L,
1L, 1L, 3L, 3L, 4L, 1L, 1L, 1L, 1L, 6L, 9L, 1L, 1L, 8L, 1L, 3L,
3L, 8L, 3L, 5L, 1L, 2L, 1L, 2L, 4L, 3L, 1L, 6L, 1L, 4L, 8L, 1L,
3L, 2L, 2L, 3L, 6L, 2L, 1L, 1L, 1L, 9L, 3L, 1L, 7L, 3L, 9L, 1L,
9L, 5L, 4L), Gender = c(1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L,
1L, 1L), Age = c(63L, 66L, 34L, 43L, 63L, 21L, 24L, 44L, 52L,
59L, 27L, 32L, 30L, 20L, 56L, 55L, 35L, 26L, 53L, 43L, 39L, 19L,
34L, 28L, 19L, 24L, 50L, 22L, 58L, 24L, 50L, 25L, 37L, 30L, 51L,
69L, 23L, 49L, 22L, 46L, 58L, 31L, 23L, 53L, 59L, 25L, 38L, 44L,
34L, 49L, 19L, 39L, 24L, 51L, 29L, 27L, 48L, 77L, 22L, 43L, 59L,
49L, 60L, 51L, 49L, 47L, 50L, 44L, 41L, 44L, 50L, 42L, 46L, 54L,
35L, 21L, 26L, 26L, 40L, 21L, 48L, 49L, 20L, 20L, 32L, 37L, 22L,
36L, 46L, 28L, 39L, 35L, 51L, 39L, 49L, 57L, 46L, 18L, 52L, 47L,
27L, 32L, 23L, 43L, 42L, 57L, 22L, 40L, 19L, 58L, 71L, 55L, 42L,
20L, 51L, 21L, 20L, 61L, 36L, 54L, 19L, 35L, 38L, 41L, 34L, 22L,
41L, 42L, 56L, 50L, 53L, 53L, 48L, 22L, 59L, 27L, 28L, 32L, 37L,
68L, 24L, 26L, 61L, 21L, 20L, 20L, 50L, 62L, 61L, 29L, 18L, 40L,
67L, 43L, 25L, 43L, 22L, 56L, 47L, 41L, 40L, 43L, 27L, 37L, 61L,
35L, 23L, 54L, 38L, 38L, 39L, 45L, 49L, 63L, 49L, 44L, 44L, 23L,
37L, 58L, 61L, 25L, 18L, 59L, 25L, 51L, 40L, 27L, 42L, 22L, 38L,
22L, 45L, 33L, 32L, 36L, 53L, 52L, 19L, 45L, 53L, 27L, 65L, 25L,
53L, 57L, 29L, 23L, 62L, 36L, 56L, 59L, 41L, 61L, 44L, 24L, 21L,
38L, 29L, 55L, 33L, 18L, 21L, 19L, 65L, 24L, 59L, 34L, 25L, 45L,
48L, 18L, 41L, 61L, 32L, 37L, 21L, 20L, 57L, 25L, 65L, 50L, 61L,
32L, 27L, 19L, 50L, 63L, 19L, 45L, 20L, 36L, 20L, 19L, 53L, 39L,
50L, 20L, 24L, 57L, 28L, 21L, 39L, 49L, 21L, 20L, 39L, 20L, 44L,
19L, 39L, 53L, 29L, 60L, 43L, 21L, 23L, 30L, 42L, 42L, 51L, 35L,
50L, 51L, 56L, 52L, 22L, 36L, 56L, 28L, 57L, 20L, 47L, 48L, 65L,
71L, 21L, 70L, 23L, 63L), Highest_Educationmx = c(4L, 5L, 3L,
2L, 3L, 2L, 3L, 1L, 3L, 1L, 7L, 3L, 2L, 3L, 3L, 2L, 6L, 2L, 3L,
6L, 3L, 2L, 2L, 7L, 2L, 1L, 2L, 3L, 6L, 3L, 5L, 3L, 5L, 6L, 2L,
1L, 5L, 2L, 5L, 1L, 1L, 3L, 2L, 3L, 1L, 7L, 5L, 4L, 7L, 3L, 1L,
1L, 6L, 3L, 3L, 2L, 4L, 6L, 5L, 4L, 2L, 6L, 1L, 3L, 4L, 2L, 1L,
5L, 5L, 3L, 1L, 5L, 3L, 3L, 1L, 4L, 2L, 3L, 5L, 3L, 1L, 4L, 2L,
1L, 2L, 7L, 2L, 5L, 3L, 2L, 6L, 1L, 1L, 3L, 4L, 1L, 5L, 1L, 3L,
4L, 2L, 7L, 2L, 4L, 4L, 7L, 4L, 6L, 3L, 1L, 2L, 1L, 5L, 5L, 1L,
5L, 2L, 7L, 3L, 4L, 2L, 4L, 2L, 4L, 2L, 2L, 4L, 1L, 2L, 1L, 2L,
6L, 1L, 2L, 5L, 2L, 2L, 5L, 1L, 6L, 5L, 2L, 1L, 2L, 1L, 1L, 3L,
2L, 4L, 3L, 2L, 3L, 1L, 5L, 5L, 7L, 1L, 3L, 3L, 2L, 1L, 3L, 4L,
5L, 1L, 1L, 3L, 3L, 3L, 5L, 3L, 6L, 4L, 3L, 1L, 3L, 5L, 7L, 1L,
3L, 4L, 5L, 3L, 3L, 1L, 1L, 1L, 7L, 3L, 1L, 4L, 3L, 3L, 5L, 1L,
4L, 5L, 4L, 2L, 5L, 3L, 1L, 1L, 5L, 4L, 7L, 5L, 2L, 2L, 5L, 3L,
1L, 1L, 2L, 3L, 5L, 3L, 7L, 5L, 1L, 5L, 3L, 1L, 1L, 1L, 1L, 7L,
5L, 7L, 3L, 1L, 5L, 7L, 6L, 3L, 7L, 2L, 2L, 3L, 1L, 2L, 1L, 5L,
5L, 2L, 4L, 1L, 1L, 2L, 1L, 4L, 7L, 3L, 2L, 5L, 3L, 2L, 4L, 2L,
1L, 7L, 5L, 2L, 2L, 2L, 3L, 4L, 1L, 2L, 5L, 2L, 3L, 3L, 1L, 3L,
2L, 3L, 5L, 1L, 3L, 1L, 5L, 4L, 5L, 4L, 5L, 5L, 5L, 1L, 3L, 3L,
1L, 3L, 6L, 3L, 4L, 3L, 3L, 5L, 3L), Censor = c(0L, 1L, 1L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L)), class = "data.frame",
row.names = c(NA, -300L))
Here is the script:
library(mgcv)
library(ggplot2)
# Run the model: a gam() fit with family = cox.ph() for SurvYear, using a
# parametric Gender term and smooths of Age and Highest_Educationmx.
# Censor is passed through the weights argument.
Model1 <- gam(SurvYear ~
                (Gender) +
                s(Age, k = 50) +
                s(Highest_Educationmx, k = 7),
              weights = Censor, data = df, gamma = 1.5, family = cox.ph())
summary(Model1)
# Build a perspective chart of the fit over Age and Highest_Educationmx
vis.gam(Model1, view = c("Age", "Highest_Educationmx"),
        plot.type = "persp", color = "gray", se = -1, theta = 45, phi = 25,
        xlab = "Age", ylab = "Highest Education",
        ticktype = "detailed", zlim = c(-5.00, 2.00))
# Plot individual predictors using plot command from mgcv.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change these options.
plot(Model1, all.terms = TRUE, rug = TRUE, residuals = FALSE, se = TRUE,
     shade = TRUE, seWithMean = TRUE)
# Plot individual predictors using ggplot instead of plot command from mgcv
# UNSURE HOW TO DO THIS
I'm biased (I wrote it) but you can use the gratia package for this.
You can use the draw() function as a replacement for plot.gam(), and if you want total control, just use evaluate_smooth() to produce a tidy representation of the smooth which is then easily plotted using ggplot2.
Here is the script based on the suggestion from Gavin Simpson above:
library(gratia)
# Evaluate the Age smooth of Model1 with gratia, then draw it with ggplot2
# instead of the plot command from mgcv.
sm <- gratia::evaluate_smooth(Model1, "Age")
ggplot(sm, aes(x = Age, y = est)) +
  geom_line(size = 1) +
  # shaded band of one se either side of the estimate
  geom_ribbon(aes(ymin = est - se, ymax = est + se), alpha = 0.2) +
  coord_cartesian(xlim = c(20, 75), ylim = c(-2, 1)) +
  scale_x_continuous(breaks = seq(20, 75, by = 5)) +
  scale_y_continuous(breaks = seq(-2, 1, by = 1)) +
  labs(title = "Age", x = "Age", y = "Linear Risk Score") +
  theme(plot.title = element_text(size = 10)) +
  # dashed reference lines at zero and at the mean of df$Age
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.5) +
  geom_vline(xintercept = mean(df$Age), linetype = "dashed", size = 0.5)

dplyr::left_join() produces an unexpected error

I have
> head(p)
studie sex n_fjernet n_sygdom
1 Group1 Male 22 1
2 Group1 Male 61 2
3 Group1 Female 50 1
4 Group1 Female 47 3
5 Group1 Female 30 1
6 Group1 Female 60 0
and
> head(u)
studie alder sex n_fjernet n_sygdom n_otte
1 Group4 59 Female 26 0 0
2 Group4 85 Male 7 1 1
3 Group4 74 Female 17 9 6
4 Group4 78 Male 13 0 0
5 Group4 41 Male 11 0 0
6 Group4 62 Male 12 0 0
I want to add u$n_otte to p for all cases where p$studie==u$studie and p$sex==u$sex and p$n_fjernet==u$n_fjernet and p$n_sygdom==u$n_sygdom, which is 895 cases in u out of the total of 1485 cases in p. All cases in p that do not match, and therefore get no u$n_otte value from left_join(), should just be listed as NA.
So I wrote
# NOTE: this is the failing attempt. "n_otte" is the column to be added from u,
# and it is not a column of p, so by = "n_otte" produces the error shown below.
left_join(p, u %>% distinct(studie, sex, n_fjernet, n_sygdom, .keep_all = TRUE), by = "n_otte")
Which returned an error
Error: `by` can't contain join column `n_otte` which is missing from LHS
I tried different left_join() approaches but all returned an error. What am I doing wrong?
u <- structure(list(studie = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Group4",
"Group3"), class = "factor"), sex = structure(c(1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L
), .Label = c("Female", "Male"), class = "factor"), n_fjernet = c(26L,
7L, 17L, 13L, 11L, 12L, 8L, 2L, 14L, 8L, 35L, 23L, 5L, 20L, 11L,
5L, 30L, 12L, 23L, 37L, 13L, 26L, 9L, 9L, 9L, 15L, 39L, 13L,
5L, 9L, 19L, 32L, 18L, 16L, 45L, 35L, 25L, 20L, 27L, 34L, 11L,
44L, 20L, 48L, 92L, 6L, 29L, 12L, 26L, 37L, 30L, 54L, 32L, 39L,
15L, 21L, 22L, 34L, 39L, 30L, 36L, 19L, 26L, 43L, 26L, 42L, 18L,
15L, 32L, 29L, 36L, 28L, 38L, 35L, 66L, 11L, 49L, 32L, 61L, 49L,
36L, 51L, 42L, 13L, 10L, 36L, 45L, 49L, 52L, 21L, 42L, 29L, 38L,
28L, 37L, 47L, 33L, 50L, 19L, 45L, 23L, 29L, 31L, 59L, 60L, 32L,
32L, 30L, 50L, 29L, 32L, 42L, 24L, 22L, 47L, 24L, 22L, 8L, 38L,
25L, 34L, 45L, 50L, 51L, 28L, 8L, 21L, 17L, 30L, 36L, 20L, 56L,
23L, 77L, 23L, 76L, 58L, 35L, 33L, 52L, 34L, 17L, 66L, 38L, 58L,
16L, 58L, 44L, 22L, 42L, 17L, 33L, 9L, 31L, 15L, 46L, 31L, 32L,
25L, 17L, 31L, 35L, 29L, 18L, 69L, 28L, 25L, 35L, 19L, 18L, 15L,
51L, 41L, 55L, 35L, 19L, 45L, 24L, 39L, 57L, 45L, 37L, 30L, 33L,
34L, 47L, 21L, 16L, 22L, 26L, 36L, 32L, 17L, 28L, 32L, 35L, 37L,
30L, 32L, 29L, 41L, 18L, 26L, 32L, 30L, 17L, 35L, 17L, 27L, 27L,
10L, 30L, 50L, 28L, 22L, 13L, 32L, 35L, 51L, 44L, 16L, 17L, 43L,
27L, 21L, 34L, 13L, 18L, 37L, 20L, 8L, 19L, 43L, 24L, 48L, 15L,
11L, 22L, 20L, 19L, 20L, 23L, 12L, 31L, 28L, 34L, 25L, 22L, 38L,
28L, 26L, 30L, 45L, 50L, 39L, 22L, 41L, 14L, 60L, 35L, 10L, 29L,
24L, 25L, 31L, 32L, 33L, 10L, 16L, 10L, 10L, 32L, 30L, 34L, 31L,
24L, 15L, 20L, 20L, 31L, 33L, 15L, 27L, 19L, 40L, 17L, 48L, 35L,
25L, 25L, 22L, 19L, 24L, 20L, 30L, 13L, 28L, 19L, 7L, 29L, 18L,
41L, 11L, 42L, 35L, 24L, 16L, 29L, 39L, 28L, 32L, 16L, 31L, 30L,
27L, 17L, 28L, 29L, 12L, 25L, 30L, 14L, 19L, 13L, 32L, 16L, 12L,
24L, 10L, 34L, 49L, 17L, 11L, 37L, 38L, 36L, 18L, 42L, 14L, 33L,
41L, 21L, 10L, 16L, 16L, 14L, 32L, 25L, 22L, 19L, 28L, 16L, 24L,
28L, 29L, 34L, 27L, 23L, 33L, 23L, 57L, 30L, 16L, 13L, 20L, 42L,
14L, 18L, 31L, 19L, 22L, 27L, 11L, 12L, 7L, 25L, 29L, 35L, 21L,
64L, 39L, 51L, 21L, 16L, 36L, 22L, 15L, 29L, 38L, 20L, 23L, 5L,
33L, 15L, 20L, 52L, 31L, 16L, 10L, 12L, 47L, 23L, 28L, 27L, 18L,
24L, 34L, 45L, 24L, 43L, 28L, 34L, 20L, 26L, 17L, 41L, 25L, 38L,
35L, 25L, 21L, 24L, 21L, 24L, 14L, 40L, 19L, 11L, 21L, 38L, 43L,
23L, 28L, 17L, 78L, 12L, 27L, 16L, 24L, 16L, 21L, 43L, 25L, 50L,
44L, 30L, 33L, 31L, 20L, 47L, 47L, 34L, 22L, 31L, 28L, 51L, 23L,
45L, 30L, 34L, 32L, 39L, 41L, 25L, 15L, 19L, 14L, 41L, 40L, 49L,
27L, 35L, 26L, 22L, 59L, 10L, 29L, 38L, 64L, 16L, 36L, 56L, 31L,
50L, 23L, 27L, 49L, 30L, 28L, 25L, 38L, 37L, 25L, 30L, 23L, 18L,
31L, 48L, 47L, 49L), n_sygdom = c(0L, 1L, 9L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 4L, 0L, 0L, 21L, 0L, 2L,
0L, 0L, 0L, 2L, 1L, 1L, 0L, 0L, 2L, 2L, 0L, 0L, 7L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 7L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 5L, 6L, 0L, 1L,
0L, 1L, 0L, 0L, 1L, 0L, 3L, 0L, 0L, 19L, 2L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 4L, 0L, 0L, 0L, 0L, 0L, 3L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 5L, 0L, 2L, 6L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 16L, 1L, 6L, 0L, 2L, 5L, 0L, 0L, 0L, 0L, 3L, 0L, 2L, 3L,
4L, 0L, 1L, 0L, 0L, 0L, 4L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 4L, 0L, 9L, 0L, 0L, 0L, 1L, 0L, 2L, 0L, 0L, 0L, 2L,
2L, 3L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 5L, 1L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L,
2L, 5L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 2L, 0L, 14L, 3L, 0L, 0L, 0L, 0L, 4L, 1L, 0L, 0L, 2L, 0L,
1L, 0L, 0L, 1L, 0L, 2L, 0L, 5L, 0L, 0L, 0L, 1L, 0L, 0L, 4L, 0L,
1L, 1L, 3L, 0L, 2L, 0L, 0L, 0L, 2L, 7L, 18L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 3L, 1L, 0L, 0L, 6L, 1L, 0L, 0L, 7L, 2L,
0L, 0L, 0L, 1L, 0L, 8L, 0L, 0L, 3L, 3L, 1L, 3L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 3L, 0L, 4L, 0L, 0L,
1L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 2L, 0L, 0L, 9L, 0L, 0L,
6L, 0L, 1L, 0L, 1L, 1L, 2L, 0L, 5L, 4L, 0L, 4L, 0L, 0L, 0L, 2L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 0L, 0L, 3L,
0L, 3L, 0L, 0L, 0L, 0L, 0L, 5L, 0L, 3L, 1L, 7L, 3L, 0L, 0L, 2L,
0L, 1L, 0L, 0L, 0L, 2L, 0L, 2L, 0L, 3L, 1L, 0L, 3L, 0L, 0L, 4L,
0L, 1L, 5L, 4L, 16L, 0L, 1L, 5L, 1L, 0L, 1L, 0L, 0L, 0L, 3L,
0L, 4L, 2L, 4L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), n_otte = c(0L, 1L, 6L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 3L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 6L, 0L, 3L, 0L, 0L, 0L,
2L, 6L, 6L, 0L, 0L, 4L, 6L, 0L, 0L, 6L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 6L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 0L, 4L, 3L, 0L, 1L, 0L, 1L, 0L, 0L,
1L, 0L, 6L, 0L, 0L, 6L, 6L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 3L, 0L,
0L, 0L, 0L, 0L, 4L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 6L,
0L, 3L, 4L, 0L, 0L, 6L, 0L, 6L, 0L, 1L, 0L, 0L, 6L, 6L, 6L, 0L,
3L, 6L, 0L, 0L, 0L, 0L, 4L, 0L, 3L, 3L, 6L, 0L, 1L, 0L, 0L, 0L,
3L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 4L, 0L, 3L,
0L, 0L, 0L, 1L, 0L, 4L, 0L, 0L, 0L, 4L, 6L, 4L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 6L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, 6L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 6L, 4L, 6L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 6L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 3L, 0L, 6L, 3L, 0L,
0L, 0L, 0L, 6L, 1L, 0L, 0L, 6L, 0L, 1L, 0L, 0L, 1L, 6L, 6L, 0L,
3L, 6L, 0L, 0L, 1L, 0L, 0L, 3L, 0L, 1L, 1L, 3L, 6L, 3L, 0L, 0L,
0L, 3L, 3L, 6L, 6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 3L, 6L,
0L, 0L, 6L, 1L, 0L, 0L, 6L, 2L, 0L, 0L, 0L, 1L, 0L, 6L, 0L, 0L,
6L, 4L, 1L, 3L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 6L,
0L, 0L, 0L, 6L, 0L, 4L, 0L, 0L, 4L, 0L, 6L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 4L, 0L, 0L, 4L, 0L, 0L, 4L, 0L, 6L, 0L, 1L, 1L, 6L, 0L,
6L, 6L, 0L, 3L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 6L, 0L, 0L, 3L, 0L, 6L, 0L, 0L, 0L, 0L, 6L, 3L,
0L, 6L, 1L, 6L, 6L, 0L, 0L, 3L, 0L, 1L, 0L, 0L, 0L, 3L, 0L, 6L,
0L, 6L, 1L, 0L, 6L, 0L, 0L, 6L, 0L, 1L, 3L, 6L, 6L, 0L, 1L, 6L,
1L, 0L, 1L, 0L, 0L, 0L, 6L, 0L, 4L, 6L, 3L, 6L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
500L), class = "data.frame")
And
p <- structure(list(studie = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Group2",
"Group3", "Group4"), class = "factor"), sex = structure(c(2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L), .Label = c("Female", "Male"), class = "factor"),
n_fjernet = c(18L, 26L, 24L, 20L, 41L, 31L, 13L, 41L, 25L,
16L, 18L, 26L, 35L, 36L, 22L, 20L, 16L, 10L, 19L, 46L, 6L,
49L, 70L, 46L, 55L, 25L, 22L, 37L, 28L, 52L, 27L, 15L, 11L,
7L, 24L, 11L, 56L, 47L, 27L, 14L, 16L, 21L, 43L, 25L, 50L,
44L, 30L, 33L, 31L, 20L, 47L, 47L, 34L, 22L, 31L, 28L, 51L,
23L, 45L, 30L, 34L, 32L, 39L, 41L, 25L, 15L, 19L, 14L, 41L,
40L, 49L, 27L, 35L, 26L, 22L, 59L, 10L, 29L, 38L, 64L, 16L,
36L, 56L, 31L, 50L, 23L, 27L, 49L, 30L, 28L, 25L, 38L, 37L,
25L, 30L, 23L, 18L, 31L, 48L, 47L, 49L, 38L, 19L, 3L, 69L,
26L, 30L, 57L, 52L, 40L, 32L, 17L, 42L, 32L, 15L, 63L, 25L,
29L, 45L, 49L, 27L, 21L, 43L, 31L, 13L, 22L, 28L, 45L, 24L,
17L, 49L, 34L, 61L, 51L, 51L, 29L, 32L, 23L, 9L, 14L, 28L,
35L, 43L, 46L, 32L, 52L, 22L, 34L, 66L, 27L, 59L, 31L, 27L,
34L, 38L, 69L, 50L, 63L, 48L, 37L, 41L, 31L, 48L, 35L, 36L,
30L, 38L, 39L, 22L, 97L, 19L, 29L, 72L, 25L, 113L, 17L, 62L,
29L, 44L, 24L, 20L, 48L, 66L, 30L, 24L, 19L, 42L, 27L, 87L,
24L, 19L, 45L, 30L, 34L, 57L, 51L, 28L, 26L, 40L, 102L, 23L,
54L, 32L, 18L, 22L, 4L, 40L, 56L, 3L, 34L, 46L, 29L, 14L,
33L, 52L, 15L, 33L, 44L, 25L, 35L, 33L, 45L, 50L, 38L, 33L,
24L, 45L, 61L, 17L, 38L, 18L, 65L, 61L, 19L, 19L, 25L, 68L,
39L, 21L, 18L, 39L, 36L, 46L, 35L, 68L, 18L, 14L, 18L, 28L,
55L, 30L, 40L, 57L, 52L, 91L, 60L, 84L, 92L, 26L, 65L, 39L,
73L, 36L, 33L, 51L, 133L, 66L, 62L, 38L, 53L, 70L, 33L, 20L,
52L, 45L, 64L, 106L, 70L, 24L, 23L, 44L, 35L, 31L, 52L, 46L,
33L, 15L, 42L, 35L, 33L, 19L, 54L, 64L, 37L, 27L, 51L, 27L,
52L, 61L, 38L, 31L, 46L, 86L, 44L, 58L, 32L, 27L, 13L, 12L,
38L, 72L, 20L, 59L, 37L, 27L, 23L, 59L, 36L, 28L, 38L, 26L,
64L, 34L, 38L, 21L, 34L, 44L, 33L, 55L, 38L, 51L, 49L, 45L,
44L, 40L, 33L, 19L, 18L, 45L, 52L, 63L, 16L, 24L, 50L, 59L,
98L, 60L, 63L, 49L, 59L, 35L, 35L, 38L, 56L, 78L, 68L, 56L,
42L, 80L, 58L, 39L, 50L, 17L, 37L, 40L, 22L, 51L, 32L, 34L,
17L, 33L, 18L, 33L, 25L, 4L, 57L, 47L, 27L, 33L, 20L, 42L,
29L, 41L, 22L, 17L, 9L, 17L, 39L, 78L, 19L, 37L, 50L, 34L,
14L, 29L, 49L, 25L, 33L, 54L, 47L, 12L, 18L, 30L, 22L, 33L,
52L, 80L, 20L, 33L, 61L, 34L, 36L, 67L, 35L, 36L, 24L, 12L,
47L, 29L, 38L, 30L, 25L, 19L, 28L, 37L, 72L, 31L, 39L, 36L,
30L, 60L, 45L, 29L, 56L, 44L, 124L, 42L, 39L, 26L, 74L, 25L,
25L, 124L, 32L, 28L, 32L, 9L, 21L, 25L, 24L, 40L, 14L, 42L,
49L, 21L, 28L, 44L, 38L, 24L, 28L, 34L, 26L, 46L, 36L, 31L,
39L, 22L, 80L, 37L, 54L, 19L, 14L, 55L, 42L, 45L, 23L, 31L,
21L, 33L, 25L, 18L, 46L, 22L, 54L, 32L, 28L, 28L, 31L, 28L,
29L, 41L, 34L, 24L, 41L, 32L, 39L, 14L, 32L, 46L, 32L), n_sygdom = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 3L, 8L, 5L, 8L,
3L, 6L, 3L, 3L, 3L, 6L, 13L, 7L, 16L, 12L, 5L, 4L, 6L, 10L,
8L, 3L, 7L, 6L, 6L, 10L, 5L, 7L, 8L, 5L, 3L, 2L, 3L, 4L,
4L, 2L, 4L, 5L, 2L, 2L, 5L, 2L, 2L, 12L, 7L, 3L, 7L, 4L,
9L, 6L, 3L, 3L, 4L, 1L, 12L, 3L, 3L, 4L, 3L, 2L, 2L, 3L,
2L, 3L, 2L, 4L, 8L, 2L, 2L, 3L, 4L, 4L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 9L, 2L, 22L, 3L, 2L, 6L, 4L, 2L, 3L, 3L,
2L, 4L, 4L, 4L, 4L, 3L, 17L, 2L, 7L, 2L, 1L, 4L, 6L, 6L,
8L, 8L, 5L, 2L, 3L, 3L, 3L, 3L, 5L, 2L, 2L, 2L, 2L, 2L, 4L,
4L, 6L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 3L,
2L, 2L, 2L, 3L, 3L, 4L, 3L, 2L, 3L, 2L, 2L, 8L, 2L, 3L, 3L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 3L, 1L, 3L, 13L, 4L, 9L, 4L, 3L, 2L, 3L, 4L,
3L, 2L, 8L, 4L, 10L, 10L, 2L, 3L, 6L, 8L, 6L, 3L, 3L, 2L,
7L, 5L, 3L, 12L, 2L, 2L, 1L, 2L, 3L, 1L, 2L, 5L, 2L, 7L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA,
500L), class = "data.frame")
# Merge `p` with `u` on the four shared key columns, keeping every row of `p`
# (all.x = TRUE). `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
merge(p, u, by = c("studie", "sex", "n_fjernet", "n_sygdom"), all.x = TRUE)
or
# dplyr version: keep every row of `p` and attach the matching columns of `u`,
# matching on the four key columns.
left_join(
  p,
  u,
  by = c("studie", "sex", "n_fjernet", "n_sygdom")
)

How to create Stratified Sampling for multiple columns in R

My data set has 821049 observations (rows) and 18 columns. I would like to use 9 columns for the stratified sampling. These are "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY". My stratification variable is ID = 1:821049. How do I choose the intervals for my variables? How do I set the size of the sampling?
# Print a copy-pasteable representation of the first and last 10 rows.
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
Here is a solution to perform stratified sampling based on multiple columns. Before implementing this, consider that your data is continuous and sufficiently large, so simple random sampling may be adequate.
The way to solve this problem is to take a sample from each group. The data can be grouped either by pasting the 9 columns together or by using dplyr's group_by function.
This uses the solution from the question "How to get around error "factor has new levels" in cross-validation glm?", updated to dplyr style.
This dplyr_stratified function takes the desired sampling ratio and an arbitrary number of columns, and returns a data frame with the sampled rows. See the example below, which uses 2 columns.
# Reproducible toy data: 100 normal draws plus two grouping columns, each
# resampled with replacement from its pool of labels (`y`: A/B, `z`: D/E/F).
set.seed(1)
x <- rnorm(100)
y <- rep(c("A", "B"), each = 50)
z <- rep(c("D", "E", "F"), times = c(33, 33, 34))
data <- data.frame(
  x = x,
  y = sample(y, size = length(y), replace = TRUE),
  z = sample(z, size = length(z), replace = TRUE)
)
library(dplyr)
# Optional: tag each row with its position for later identification.
# seq_len() is used because 1:nrow(data) would count down (1, 0) for a
# zero-row data frame.
data$rowid <- seq_len(nrow(data))
#' Stratified sample of a data frame
#'
#' Groups `df` by the columns given in `...` and, within every group, keeps a
#' random `percent` share of that group's rows.
#'
#' @param df A data frame.
#' @param percent Fraction of each group to keep, e.g. 0.8 for 80 %. The
#'   per-group sample size is `percent * n()`; a fractional size is not
#'   rounded up, so very small groups may contribute no rows.
#' @param ... Unquoted grouping (stratification) columns.
#' @return The sampled rows, still grouped by the columns in `...`.
dplyr_stratified <- function(df, percent, ...) {
  columns <- enquos(...)
  # Group, then draw row positions independently inside each group. The
  # pipeline is the last expression, so the result is returned visibly
  # rather than as the invisible value of an assignment.
  df %>%
    group_by(!!!columns) %>%
    slice(sample(seq_len(n()), percent * n()))
}
# Keep 80 % of the rows within every combination of z and y, then display it.
testgroup <- dplyr_stratified(data, percent = 0.8, z, y)
testgroup
Note: this assumes each group has a sufficient number of rows to select a representative sample. (If the groups are too small, this approach may not meet expectations.)

Plotting the distribution for multiple columns

I would like to plot the distribution of multiple columns of my data set. It has over 820.000 rows and 18 columns. I want to plot all columns except the columns with the dummy variables. I have already been able to create a graphic. But I want to have the values of the x-axis on the y-axis because these are the column values and I want to display their distribution for each column.
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust); library(clValid); library(plotrix)
library(graphics); library(reshape2)
3. Import csv file
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
4 Select columns
# Concatenate the nine columns of interest end to end into one long vector
# (the column each value came from is not recorded), then melt that vector.
WKA_ohneJB2 <- do.call(
  c,
  lapply(
    c("BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SDV",
      "PIS_SHOPS", "PIS_SR", "QUANTITY"),
    function(col) WKA_ohneJB[, col]
  )
)
df <- melt(WKA_ohneJB2)
5 Plot
# Column chart of the melted values, with the flattened vector as x position.
ggplot(data = df) +
  geom_col(mapping = aes(x = WKA_ohneJB2, y = value))
This is the plot I have generated so far.
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
6 Plotting histogram
# One base-graphics histogram per selected column, laid out on a 3 x 3 grid.
var_to_plot <- c(
  "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SDV",
  "PIS_SHOPS", "PIS_SR", "QUANTITY"
)
par(mfrow = c(3, 3))
for (i in var_to_plot) {
  hist(WKA_ohneJB[, i], xlab = i, main = "")
}
I have created several histograms. But the scaling of the axes is wrong. I want the numerical values of the x axis to appear on the y axis and the numerical values of the y axis to appear on the x axis. How does this work? I also want the values to be displayed completely and not as e^.
You don't need to combine your dataframe all over again. What you need is either a density plot or histogram.
Also as good practice, load only the packages required for plotting, in this case it would be maybe ggplot2 and tidyr.
For example, I just used an example with 5 of the column names I can see in your data:
library(tidyr)
library(ggplot2)
# Simulated stand-in for the real data: an id column plus five columns of
# 10,000 normal draws each, with means 1 to 5.
# vapply() fixes the shape of every column up front, unlike sapply(), whose
# return type depends on its input.
WKA_ohneJB <- data.frame(
  dummyvar = 1:10000,
  vapply(1:5, function(mu) rnorm(n = 10000, mean = mu), numeric(10000))
)
colnames(WKA_ohneJB)[-1] <- c(
  "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL"
)
head(WKA_ohneJB)
dummyvar BASKETS_NZ PIS PIS_AP PIS_DV PIS_PL
1 1 0.92088518 0.9167877 1.956920 4.695379 4.349631
2 2 0.05335686 2.8225161 3.059749 4.317281 5.985579
3 3 1.00141759 3.5743033 2.499662 4.761415 5.886588
4 4 -1.31231486 2.5335004 5.396917 4.364643 5.866026
5 5 -0.65336724 0.2647117 3.203358 4.838659 4.437011
6 6 0.78769080 0.3630670 2.516433 3.826074 3.741611
To plot one of them, do:
# Histogram of a single column.
ggplot(data = WKA_ohneJB, mapping = aes(x = PIS)) +
  geom_histogram()
Or:
# Density curve of a single column.
ggplot(data = WKA_ohneJB, mapping = aes(x = PIS)) +
  geom_density()
To plot everything at one go, you can try to pivot it long, as you have done with melt, but I don't know if your machine can handle it, so try it for a few variables first:
# Columns to plot, plus the id column that is carried along but not reshaped.
var_to_plot <- c("BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL")
dummyvar <- "dummyvar"
# Reshape to long form (columns `name` and `value`) and draw one histogram
# panel per original column. all_of() states explicitly that var_to_plot is a
# character vector of column names; the bare `-dummyvar` it replaces was
# ambiguous between the data column and the variable of the same name.
ggplot(
  pivot_longer(
    WKA_ohneJB[, c(var_to_plot, dummyvar)],
    cols = all_of(var_to_plot)
  ),
  aes(x = value)
) +
  geom_histogram() +
  facet_wrap(~name)
If melting the data.frame is too intensive, just use base R plotting:
# One histogram per variable on a grid of 2 rows and 3 columns.
# par() returns the previous settings; they are put back afterwards so that
# later plots are not left in the grid layout.
old_par <- par(mfrow = c(2, 3))
for (i in var_to_plot) {
  hist(WKA_ohneJB[, i], xlab = i, main = "")
}
par(old_par)

R create variable IF ELSE leads to wrong values

I have a dataframe with:
"serial" the number of households, each one with a variable number of components "head, spouse, parent and child or grandchild" and total number of children in the house "nchild"
I want to create a new variable (in the dput I added an example for clarity: withCM 'living with male child' and withCF). I have tried various combinations but I cannot discriminate on the sex of the child within the same "serial", so that for withCM=1 only when relate=="child"&sex==1, but the 1 would appear on a different row (that of the head, spouse or parent)
# 1 for rows with nchild > 0 whose relate is not "child", 0 otherwise.
mydata$withCM <- as.numeric(mydata$nchild > 0 & mydata$relate != "child")
mydata <- structure(list(serial = c(12345L, 12345L, 12345L, 12345L, 12346L,
12346L, 12347L, 12347L, 12347L, 12348L, 12348L, 12348L, 12348L,
12348L, 12348L, 12348L, 12349L, 12350L, 12350L, 12351L, 12351L,
12351L, 12352L, 12352L, 12352L, 12352L, 12352L, 12353L, 12354L,
12354L), age = c(45L, 44L, 13L, 11L, 29L, 28L, 65L, 61L, 35L,
68L, 61L, 35L, 34L, 6L, 2L, 1L, 62L, 54L, 52L, 67L, 67L, 12L,
49L, 50L, 28L, 21L, 22L, 70L, 89L, 55L), sex = c(1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), relate = structure(c(4L,
7L, 1L, 1L, 4L, 7L, 6L, 6L, 4L, 4L, 7L, 1L, 2L, 3L, 3L, 3L, 4L,
4L, 7L, 4L, 7L, 3L, 4L, 7L, 1L, 5L, 5L, 4L, 6L, 4L), .Label = c("child",
"childinlaw", "grandchild", "head", "nonrelative", "parent",
"spouse"), class = "factor"), nchild = c(2L, 2L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), conhija = c(1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), conhijo = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("serial",
"age", "sex", "relate", "nchild", "conhija", "conhijo"), class = "data.frame", row.names = c(NA,
-30L))
You can tabulate the gender, family, and role-within-family as:
# Three-way count table indexed by household serial, sex and relationship.
xtab <- table(
  mydata[["serial"]],
  mydata[["sex"]],
  mydata[["relate"]]
)
And then choose the heads of the families (or, in the commented line, anyone who has the specific relationship), and alter their tallies as follows:
# Start every row at zero; only the selected rows receive a tally below.
mydata$sex1 <- 0
mydata$sex2 <- 0
# Rows to update: household heads. Swap in the commented line to update every
# head, spouse and parent instead.
ind <- mydata$relate == "head"
# ind <- mydata$relate %in% c("head", "spouse", "parent")
# For each selected row, look up its household's count of children of sex 1
# and of sex 2 in the count table.
household_keys <- as.character(mydata$serial[ind])
mydata$sex1[ind] <- xtab[household_keys, "1", "child"]
mydata$sex2[ind] <- xtab[household_keys, "2", "child"]
Use lapply to split into families, then test if they are an adult, and there is at least one male child in the unit.
#' Flag the adults of one household who live with a male child
#'
#' @param serial Household identifier; the rows whose `serial` column equals
#'   it form the household.
#' @param data Data frame with columns `serial`, `relate` and `sex`. Defaults
#'   to the global `mydata`, so existing one-argument calls keep working.
#' @return A logical vector with one element per row of the household, in row
#'   order: TRUE for a head, spouse or parent when the household contains at
#'   least one row with relate == "child" and sex == 1, FALSE otherwise.
lives_with_boy <- function(serial, data = mydata) {
  unit <- data[data$serial == serial, ]
  is_adult <- as.character(unit$relate) %in% c("head", "spouse", "parent")
  # na.rm = TRUE: a missing sex or relate on one member must not turn the
  # whole household's answer into NA.
  has_boy <- any(unit$relate == "child" & unit$sex == 1, na.rm = TRUE)
  is_adult & has_boy
}
# Evaluate each household once with lapply(), then put every household's flags
# back on that household's own rows. unsplit() places values by `serial`, so
# the result stays aligned even when a household's rows are not stored next to
# each other; a plain unlist() would only line up for contiguous households.
families <- unique(mydata$serial)
mydata$withCM <- unsplit(
  lapply(families, lives_with_boy),
  factor(mydata$serial, levels = families)
)

Resources