Related
I have two columns of arrays that I would like to give individuals cells to. For example, my columns currently look like this:
NDVI
Dates
[0.1,0.5,0.7]
[1.54E12, 1.54E12, 1.54E12]
How can I take this and reformat it to:
NDVI
Dates
0.1
1.54E12
0.5
1.54E12
0.7
1.54E12
Here is some example data:
df <- structure(list(NDVIs = c("[0.0048532285033119]", "[0.031866426092985685, 0.0346768172783618, 0.05690584716758292, 0.05963840093735323, 0.049984507125651834, 0.05225985971236913, 0.05248901349444936, 0.05518163825042716, -0.010183523082805207, -0.001665440442765531, 0.03987908018516375, 0.04187215406831169, 0.08414429526637883, 0.08715243094329189, -0.005957091678983289, -0.009527640910524507, 0.010492166097587687, 0.026020594673257052, 0.002104522074938855, 0.003507257657688491, 0.06828452153139898, 0.07606416388481754, 0.06883156241096627, 0.07718980197474816, 0.06544808609317253, 0.07474777221051583, 0.10378917503028039, 0.11123383912244203, 0.05613527636023197, 0.06331642130827059, 0.06924611219668912, 0.07666941206343858, 0.06120038033501253, 0.07040186289113536, 0.0641993427579592, 0.07613137403151546, 0.02045492869437104, 0.0331589149611709, 0.07758880477807421, 0.08753787069131283, 0.07957631716729259, 0.08952965234407557, 0.0885082162194797, 0.09848859431952328, -0.0020876210575472842, 0.00246885618171861, 0.10852078141016781, 0.11765974025773, -0.026210051068679536, -0.017421882870782675, 0.10807407562845053, 0.11710267730586506, 0.11281567251547685, 0.12184561048692327, 0.07186643839535878, 0.07790008153091242, 0.022632037271678205, 0.0365365472967711]",
"[0.03529428376870003, 0.009603157045076827, 0.010105041611001576, 0.022109721139215967, 0.04320847194846547, 0.06382705400539401, 0.01733482969659961]",
"[0.05067307147600608, 0.04650335539685854, 0.04230580325409252, 0.019941600277439516, 0.03878433876933318, 0.027594882066713636, 0.032576179777168805, 0.006187539680589851, 0.0018615082099339225, 0.042639901745788876, 0.04555515255591705, 0.05557934077121789, 0.03141074118001885, -0.024594166196115066, 0.06226281017212869, 0.02775252318220509]",
"[0.22694578827413947, 0.30629362044983743, 0.26378399913683515, 0.2582198975762179, 0.25641849395215666, 0.23752142045041516, 0.23781654568851535, 0.14139758147170572, 0.2541489741719387, 0.2419639056967162, 0.2275033844898179, 0.24470134691705184, 0.22140420350534376, 0.22465304694290103, 0.21744809821978436, 0.20724760985497206, 0.19287780410696884]",
"[0.02437980239793386, 0.046501630291612854, 0.03515690678492754, 0.03996495641681273, 0.02565338402629212, 0.02522057544087444, 0.016355304453160305, 0.010076012002045382, 0.024906389352811975, 0.040025952670788865, 0.05043572220046672, 0.0676703068532929, 0.08040324690810197, 0.011653761506917892, 0.07901386245569544, 0.14084622762340893, 0.13315404483239526, 0.14297013211925677]",
"[0.0696531704758789, 0.06968717739514622, 0.060495302532228894, 0.060465281863641605, 0.050631820224526766, 0.05069794600355197, 0.04996845148127263, 0.050053255940312645, 0.052401506092804355, 0.0524735297646711, 0.040763890820717345, 0.04078582273414091, 0.04106548840491197, 0.04103821234740117, 0.003195298620254946, -0.005673033858736476, -0.0018119432210428274, 0.0015232350000433797, 0.04386980266596469, 0.044086120151773434, 0.046401011127926925, 0.04647825535002269, 0.03833234785505313, 0.03824161507110303, 0.06037120877605433, 0.06055975876554486, 0.0592133770889435, 0.05950182701095346, 0.05530416996051339, 0.05463335706292855, 0.033097906541426636, 0.02467463272473935, 0.0580151510674638, 0.05815851281926995, 0.05560044632496825, 0.05621848408403043]",
"[0.40218207770330455, 0.3833707257903159, 0.3498789365559936, 0.3415851169240495, 0.11857246658703605, 0.09110434200774196, 0.02929675584172094, 0.13218157545304937, 0.00568591277213531, 0.2312335955409189, 0.06118283356210741, 0.23127376963601848, 0.07698393284205111, 0.05667080989824748, 0.11169816100135183, 0.018740255299188605, 0.07543980191846927]",
"[0.04179551280868712, 0.0383419807710962, 0.0504998226331591, 0.0011002529304645489, 0.031002814569137, 0.022506919207544558, 0.03933692454785477, 0.029375815795034198, 0.04674437953418841, 0.04864569790993668, 0.036774224525189025, 0.002999267453586999, 0.03965538865956306, 0.04553202255868567, 0.012738284285019974, 0.05355605970787331, 0.03678091244450905]",
"[0.023817638054132135, 0.02855430777145022, 0.08243891605296151, 0.014681590484211679, 0.14321736621086276, 0.07733703130141825, 0.015297125457766068, 0.10985901950753231, 0.12587086535155378, 0.13064566485984275, 0.1297667968282656]"
), dates = c("[1.527502095E12]", "[1.544955875E12, 1.544955875E12, 1.545126378E12, 1.545126378E12, 1.545394367E12, 1.545394367E12, 1.545561339E12, 1.545561339E12, 1.545820928E12, 1.545820928E12, 1.545993046E12, 1.545993046E12, 1.546250253E12, 1.546250253E12, 1.546425701E12, 1.546425701E12, 1.546684843E12, 1.546684843E12, 1.546854495E12, 1.546854495E12, 1.547112218E12, 1.547112218E12, 1.547288454E12, 1.547288454E12, 1.547546364E12, 1.547546364E12, 1.547718166E12, 1.547718166E12, 1.547980939E12, 1.547980939E12, 1.548153141E12, 1.548153141E12, 1.548413046E12, 1.548413046E12, 1.548585114E12, 1.548585114E12, 1.548842378E12, 1.548842378E12, 1.549019277E12, 1.549019277E12, 1.549276707E12, 1.549276707E12, 1.549447219E12, 1.549447219E12, 1.549708493E12, 1.549708493E12, 1.549880513E12, 1.549880513E12, 1.550138768E12, 1.550138768E12, 1.550312936E12, 1.550312936E12, 1.55057479E12, 1.55057479E12, 1.550745218E12, 1.550745218E12, 1.551007067E12, 1.551007067E12]",
"[1.544868896E12, 1.545732881E12, 1.546164507E12, 1.546599228E12, 1.547030531E12, 1.547460976E12, 1.547892521E12]",
"[1.562323826E12, 1.562755476E12, 1.563185103E12, 1.563624031E12, 1.56405523E12, 1.564483439E12, 1.564915837E12, 1.565347309E12, 1.565777661E12, 1.566215882E12, 1.566643642E12, 1.567097272E12, 1.567507638E12, 1.568369457E12, 1.56880789E12, 1.569668195E12]",
"[1.552392943E12, 1.552819566E12, 1.553251718E12, 1.553682704E12, 1.554112987E12, 1.55454323E12, 1.554975918E12, 1.555408185E12, 1.555841425E12, 1.556271571E12, 1.55670391E12, 1.557135402E12, 1.557568058E12, 1.557998903E12, 1.558863212E12, 1.55929586E12, 1.559728791E12]",
"[1.561454836E12, 1.561888229E12, 1.562320207E12, 1.562751856E12, 1.563181463E12, 1.563618614E12, 1.564046906E12, 1.564480274E12, 1.564912267E12, 1.565343999E12, 1.565773615E12, 1.566210466E12, 1.566639283E12, 1.567072243E12, 1.567504618E12, 1.567936832E12, 1.568366042E12, 1.568802771E12]",
"[1.561028124E12, 1.561028124E12, 1.561460566E12, 1.561460566E12, 1.561887116E12, 1.561887116E12, 1.562321887E12, 1.562321887E12, 1.562750997E12, 1.562750997E12, 1.563188512E12, 1.563188512E12, 1.563622208E12, 1.563622208E12, 1.564052361E12, 1.564052361E12, 1.564478817E12, 1.564478817E12, 1.564913968E12, 1.564913968E12, 1.565342227E12, 1.565342227E12, 1.565780829E12, 1.565780829E12, 1.566212692E12, 1.566212692E12, 1.566646344E12, 1.566646344E12, 1.567071334E12, 1.567071334E12, 1.567508507E12, 1.567508507E12, 1.567935649E12, 1.567935649E12, 1.568374212E12, 1.568374212E12]",
"[1.558863212E12, 1.55929586E12, 1.559728791E12, 1.5601599E12, 1.560589582E12, 1.561026607E12, 1.561454836E12, 1.561888229E12, 1.562320207E12, 1.562751856E12, 1.563181463E12, 1.563618614E12, 1.564047988E12, 1.564480274E12, 1.564912267E12, 1.565343999E12, 1.565773615E12]",
"[1.568719125E12, 1.569150328E12, 1.569583549E12, 1.570013418E12, 1.570451296E12, 1.570878011E12, 1.571311086E12, 1.571742422E12, 1.572172948E12, 1.572604827E12, 1.573038304E12, 1.573470317E12, 1.573899161E12, 1.574334225E12, 1.575196752E12, 1.57562775E12, 1.5760617E12]",
"[1.545124633E12, 1.545561224E12, 1.545990899E12, 1.546423114E12, 1.546853425E12, 1.547287051E12, 1.547718865E12, 1.548155787E12, 1.548586075E12, 1.549015326E12, 1.549447719E12]"
)), row.names = c(NA, -10L), class = "data.frame")
We can extract the characters between the [ and ] (i.e. everything that is not a ]) with str_extract_all, which returns a list-column; then unnest the columns, and split the column values at a comma followed by one or more spaces (\\s+) with separate_rows.
library(dplyr)
library(stringr)
library(tidyr)
# For every column: pull out the text between "[" and "]", then unnest the
# resulting list-columns and split each string on ",<spaces>" into rows.
# Fixed: passing a function plus extra arguments to across() via `...` is
# deprecated (dplyr >= 1.1.0); use an anonymous function instead.
df %>%
  mutate(across(everything(), ~ str_extract_all(.x, "(?<=\\[)[^]]+"))) %>%
  unnest(c(NDVIs, dates)) %>%
  separate_rows(c(NDVIs, dates), sep = ",\\s+", convert = TRUE)
Output:
# A tibble: 198 x 2
# NDVIs dates
# <dbl> <dbl>
# 1 0.00485 1527502095000
# 2 0.0319 1544955875000
# 3 0.0347 1544955875000
# 4 0.0569 1545126378000
# 5 0.0596 1545126378000
# 6 0.0500 1545394367000
# 7 0.0523 1545394367000
# 8 0.0525 1545561339000
# 9 0.0552 1545561339000
#10 -0.0102 1545820928000
# … with 188 more rows
Or, since the [ and ] carry no information once the values are split, simply remove them with str_remove_all and use separate_rows directly:
# Strip the square brackets from every column, then split each
# comma-separated string into one row per value (convert = TRUE turns the
# resulting strings into numerics).
# Fixed: passing a function plus extra arguments to across() via `...` is
# deprecated (dplyr >= 1.1.0); use an anonymous function instead.
df %>%
  mutate(across(everything(), ~ str_remove_all(.x, "\\[|\\]"))) %>%
  separate_rows(c(NDVIs, dates), sep = ",\\s+", convert = TRUE)
# A tibble: 198 x 2
# NDVIs dates
# <dbl> <dbl>
# 1 0.00485 1527502095000
# 2 0.0319 1544955875000
# 3 0.0347 1544955875000
# 4 0.0569 1545126378000
# 5 0.0596 1545126378000
# 6 0.0500 1545394367000
# 7 0.0523 1545394367000
# 8 0.0525 1545561339000
# 9 0.0552 1545561339000
#10 -0.0102 1545820928000
# … with 188 more rows
A base R option :
You can remove opening and closing square brackets using gsub and split the string on comma using strsplit.
# Strip the brackets with gsub, split each string on commas (with optional
# spaces) via strsplit, flatten, and rebuild a data frame with one scalar
# value per row. Columns stay character; wrap the unlist() result in
# as.numeric() if numeric columns are wanted.
result <- as.data.frame(
  lapply(df, function(col) {
    unlist(strsplit(gsub("\\[|\\]", "", col), ",\\s*"))
  })
)
head(result)
# NDVIs dates
#1 0.0048532285033119 1.527502095E12
#2 0.031866426092985685 1.544955875E12
#3 0.0346768172783618 1.544955875E12
#4 0.05690584716758292 1.545126378E12
#5 0.05963840093735323 1.545126378E12
#6 0.049984507125651834 1.545394367E12
If you want both the column to be numeric you can add as.numeric in lapply i.e as.numeric(unlist(strsplit(gsub......
After a simulation I have data like that :
capt2[1,1] capt2[2,1] capt2[3,1] capt2[4,1] capt2[5,1] capt2[6,1] capt2[1,2] capt2[2,2] capt2[3,2] capt2[4,2]
1 4.582288e-05 5.115372e-05 6.409558e-05 7.132340e-05 6.927382e-05 5.727399e-05 2.753242e-05 3.106131e-05 3.832073e-05 4.270945e-05
2 4.675470e-05 5.045181e-05 6.467788e-05 7.112534e-05 6.809241e-05 5.885455e-05 2.789134e-05 3.097479e-05 3.790915e-05 4.176663e-05
3 4.586335e-05 5.127838e-05 6.344857e-05 6.934458e-05 6.622970e-05 5.651329e-05 2.795094e-05 3.120102e-05 3.790188e-05 4.172773e-05
4 4.572750e-05 5.150407e-05 6.333068e-05 7.145439e-05 6.624694e-05 5.836059e-05 2.795106e-05 3.055858e-05 3.826570e-05 4.172327e-05
5 4.740812e-05 5.113890e-05 6.397921e-05 7.163161e-05 6.838507e-05 5.620327e-05 2.790780e-05 3.083819e-05 3.821806e-05 4.198080e-05
6 4.583460e-05 5.106634e-05 6.340507e-05 7.030548e-05 6.886533e-05 5.901374e-05 2.792663e-05 3.136544e-05 3.862876e-05 4.177590e-05
with a length of 40000 lines.
However the [1: 6,] refers to months and the [, 1: x] refers to territories. So I would like to have [, 1: x] columns (in my dataset 28) for [1: 6,] rows and have the length (40000) in the third dimension since these are simulations.
Subsequently with my 3D table of 6 lines and 28 columns, I would like to do simple operations, such as for example a histogram of the 3D values of line 1 / column 1 etc ...
edit : "capt2[3,1]" it's just the name of the column in character
Just transform it into an array.
I'll simulate some data to show you how to do this.
set.seed(42)
n <- 10 # `n` in your data would be 40,000
# your column names: the month index (1-6) varies fastest, then the
# territory (1-3) — built programmatically instead of typing all 18 strings
v <- paste0("capt2[", rep(1:6, times = 3), ",", rep(1:3, each = 6), "]")
# stands in for your data: n simulations (rows) x 18 named columns
M <- matrix(rnorm(6 * 3 * n), nrow = n, ncol = 6 * 3,
            dimnames = list(NULL, v))
M[1:2, 1:6]
# capt2[1,1] capt2[2,1] capt2[3,1] capt2[4,1] capt2[5,1] capt2[6,1]
# [1,] -0.132088 0.5156677 1.3487070 1.01687283 -0.73844075 0.8131950
# [2,] 1.476787 -0.2343653 -0.0227647 -0.02671746 0.04656394 -0.1908165
Now apply array with the right dimensions and dimnames.
# Equivalent to array(as.vector(t(M)), ...): reshape the n x 18 matrix so
# that dim 1 = month (1-6), dim 2 = territory (1-3), dim 3 = simulation.
# array(M, c(n, 6, 3)) reads M column by column; aperm() then moves the
# simulation index to the third dimension.
A <- aperm(array(M, dim = c(n, 6, 3)), perm = c(2, 3, 1))
dimnames(A) <- list(paste0("month.", 1:6), paste0("territory.", 1:3), NULL)
A
# , , 1
#
# territory.1 territory.2 territory.3
# month.1 -0.1320880 0.4703934 -1.3870266
# month.2 0.5156677 2.4595935 1.1573471
# month.3 1.3487070 -0.1662615 -0.2901453
# month.4 1.0168728 0.4823695 1.8922020
# month.5 -0.7384408 -0.7848878 -0.2764311
# month.6 0.8131950 1.1454705 -0.3047780
#
# , , 2
#
# territory.1 territory.2 territory.3
# month.1 1.47678742 -1.24267027 -1.3066759
# month.2 -0.23436528 -0.81838032 -1.6824809
# month.3 -0.02276470 0.86256338 0.8285461
# month.4 -0.02671746 0.99294364 -1.3859983
# month.5 0.04656394 0.16341632 -1.1094188
# month.6 -0.19081647 0.03157319 0.5978327
#
# , , 3
#
# territory.1 territory.2 territory.3
# month.1 -0.2170302 1.38157546 -0.76839533
# month.2 -0.6585034 -2.11320011 0.08731909
# month.3 0.2442259 0.09734049 -0.29122771
# month.4 0.7036078 -1.24639550 -0.41482430
# month.5 -1.0175961 -1.23671424 0.13386932
# month.6 -2.6999298 -0.83520581 1.39742941
[...]
In "Zero frequent items" when using the eclat to mine frequent itemsets, the OP is interested in the groupings/clusterings based on how frequent they are ordered together. This grouping can be inspected by the arules::inspect function.
library(arules)
# Read the semicolon-separated transaction file; rm.duplicates drops items
# that occur more than once within the same transaction.
dataset <- read.transactions("8GbjnHK2.txt", sep = ";", rm.duplicates = TRUE)
# Mine frequent itemsets with eclat; tidLists = TRUE also stores which
# transactions support each itemset.
f <- eclat(dataset,
parameter = list(
supp = 0.001,
maxlen = 17,
tidLists = TRUE))
# Print the 10 most frequent itemsets (sorted by support, descending).
inspect(head(sort(f, by = "support"), 10))
The data set can be downloaded from https://pastebin.com/8GbjnHK2.
However, the output cannot be easily saved to another object as a data frame.
out <- inspect(f)
So how do we capture the output of inspect(f) for use as data frame?
We can use the methods labels to extract the associations/groupings and quality to extract the quality measures (support and count). We can then use cbind to store these into a data frame.
# labels() returns the itemset strings, quality() the support/count columns;
# cbind() combines them into a plain data frame that can be saved/reused.
out <- cbind(labels = labels(f), quality(f))
head(out)
# labels support count
# 1 {3031093,3059242} 0.001010 16
# 2 {3031096,3059242} 0.001073 17
# 3 {3060614,3060615} 0.001010 16
# 4 {3022540,3072091} 0.001010 16
# 5 {3061698,3061700} 0.001073 17
# 6 {3031087,3059242} 0.002778 44
Coercing the itemsets to a data.frame also creates the required output.
> head(as(f, "data.frame"))
items support count
1 {3031093,3059242} 0.001010101 16
2 {3031096,3059242} 0.001073232 17
3 {3060614,3060615} 0.001010101 16
4 {3022540,3072091} 0.001010101 16
5 {3061698,3061700} 0.001073232 17
6 {3031087,3059242} 0.002777778 44
How do I set missing values for multiple labelled vectors in a data frame? I am working with a survey dataset from SPSS. I am dealing with about 20 different variables with the same missing values, so I would like to find a way to use lapply() to make this work, but I can't.
I actually can do this with base R via as.numeric() and then recode() but I'm intrigued by the possibilities of haven and the labelled class so I'd like to find a way to do this all in Hadley's tidyverse
Roughly, the variables of interest look like this. I am sorry if this is a basic question, but I find the help documentation associated with the haven and labelled packages very unhelpful.
library(haven)
library(labelled)
# Two labelled vectors with identical value labels; the goal is to mark
# 5 (dk) and 6 (refused) as user-defined missing values in every column.
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v3<-data.frame(v1=v1, v2=v2)
lapply(v3, val_labels)
# NOTE(review): this does not work as hoped — set_na_values() expects a
# data frame as its first argument (see the accepted explanation below),
# and lapply() returns modified copies rather than changing v3 in place.
lapply(v3, function(x) set_na_values(x, c(5,6)))
Ok, I think I understand now what you are trying to do...
i.e. Mark the labels, and the values as NA without removing the underlying imported data...
See addendum for a more detailed example that uses a public data file to show an example that harnesses dplyr to update multiple columns, labels...
Proposed Solution
# Fixed: data_frame() is deprecated in tibble; use tibble() instead.
# s1 keeps plain value labels; s2 maps dk/refused to tagged NAs and also
# declares 5 and 6 as SPSS-style user-defined missing values.
# NOTE(review): tagged_na() tags are documented as single letters a-z;
# "5"/"6" appear to work in this transcript, but confirm against haven docs.
df <- tibble(s1 = c(1, 2, 2, 2, 5, 6), s2 = c(1, 2, 2, 2, 5, 6)) %>%
  set_value_labels(s1 = c(agree = 1, disagree = 2, dk = 5, refused = 6),
                   s2 = c(agree = 1, disagree = 2, dk = tagged_na("5"),
                          refused = tagged_na("6"))) %>%
  set_na_values(s2 = c(5, 6))
val_labels(df)   # s1 keeps 5/6; s2 shows NA for dk/refused
is.na(df$s1)     # all FALSE: no missings declared on s1
is.na(df$s2)     # TRUE for the 5 and 6 entries
df
Solution Result:
> library(haven)
> library(labelled)
> library(dplyr)
> df <- data_frame(s1 = c(1,2,2,2,5,6), s2 = c(1,2,2,2,5,6)) %>%
+ set_value_labels(s1 = c(agree=1, disagree=2, dk=5, refused=6),
+ s2 = c(agree=1, disagree=2, dk = tagged_na("5"), refused = tagged_na("6"))) %>%
+ set_na_values(s2 = c(5,6))
> val_labels(df)
$s1
agree disagree dk refused
1 2 5 6
$s2
agree disagree dk refused
1 2 NA NA
> is.na(df$s1)
[1] FALSE FALSE FALSE FALSE FALSE FALSE
> is.na(df$s2)
[1] FALSE FALSE FALSE FALSE TRUE TRUE
> df
# A tibble: 6 × 2
s1 s2
<dbl+lbl> <dbl+lbl>
1 1 1
2 2 2
3 2 2
4 2 2
5 5 5
6 6 6
Now we can manipulate the data
mean(df$s1, na.rm = TRUE)
mean(df$s2, na.rm = TRUE)
> mean(df$s1, na.rm = TRUE)
[1] 3
> mean(df$s2, na.rm = TRUE)
[1] 1.75
Use Labelled package to remove labels and replace with R NA
If you wish to strip the labels and replace with R NA values you can use remove_labels(x, user_na_to_na = TRUE)
Example:
df <- remove_labels(df, user_na_to_na = TRUE)
df
Result:
> df <- remove_labels(df, user_na_to_na = TRUE)
> df
# A tibble: 6 × 2
s1 s2
<dbl> <dbl>
1 1 1
2 2 2
3 2 2
4 2 2
5 5 NA
6 6 NA
--
Explanation / Overview of SPSS Format:
IBM SPSS (The application) can import and export data in many formats and in non-rectangular configurations; however, the data set is always translated to an SPSS rectangular data file, known as a system file (using the extension *.sav). Metadata (information about the data) such as variable formats, missing values, and variable and value labels are stored with the dataset.
Value Labels
Base R has one data type that effectively maintains a mapping between integers and character labels: the factor. This, however, is not the primary use of factors: they are instead designed to automatically generate useful contrasts for linear models. Factors differ from the labelled values provided by the other tools in important ways:
SPSS and SAS can label numeric and character values, not just integer values.
Missing Values
All three tools (SPSS, SAS, Stata) provide a global “system missing value” which is displayed as .. This is roughly equivalent to R’s NA, although neither Stata nor SAS propagate missingness in numeric comparisons: SAS treats the missing value as the smallest possible number (i.e. -inf), and Stata treats it as the largest possible number (i.e. inf).
Each tool also provides a mechanism for recording multiple types of missingness:
Stata has “extended” missing values, .A through .Z.
SAS has “special” missing values, .A through .Z plus ._.
SPSS has per-column “user” missing values. Each column can declare up to three distinct values or a range of values (plus one distinct value) that should be treated as missing.
User Defined Missing Values
SPSS’s user-defined values work differently to SAS and Stata. Each column can have either up to three distinct values that are considered as missing or a range. Haven provides labelled_spss() as a subclass of labelled() to model these additional user-defined missings.
# Fixed: the argument is `na_values` (plural); `na_value` only worked via
# R's partial argument matching, which is fragile — spell it out.
# x1 declares a single discrete user-missing value (99);
# x2 declares a missing range [90, Inf) instead.
x1 <- labelled_spss(c(1:10, 99), c(Missing = 99), na_values = 99)
x2 <- labelled_spss(c(1:10, 99), c(Missing = 99), na_range = c(90, Inf))
x1
#> <Labelled SPSS double>
#> [1] 1 2 3 4 5 6 7 8 9 10 99
#> Missing values: 99
#>
#> Labels:
#> value label
#> 99 Missing
x2
#> <Labelled SPSS double>
#> [1] 1 2 3 4 5 6 7 8 9 10 99
#> Missing range: [90, Inf]
#>
#> Labels:
#> value label
#> 99 Missing
Tagged missing values
To support Stata’s extended and SAS’s special missing values, haven implements a tagged NA. It does this by taking advantage of the internal structure of a floating point NA. That allows these values to behave identically to NA in regular R operations, while still preserving the value of the tag.
The R interface for creating with tagged NAs is a little clunky because generally they’ll be created by haven for you. But you can create your own with tagged_na():
Important:
Note these tagged NAs behave identically to regular NAs, even when printing. To see their tags, use print_tagged_na():
Thus:
library(haven)
library(labelled)
# v1: plain labelled values; 5 and 6 remain ordinary values.
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
# v2: dk/refused are mapped to tagged NAs, so their labels print as NA
# while the tag is preserved internally (use print_tagged_na() to see it).
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=tagged_na("5"), refused= tagged_na("6")))
v3<-data.frame(v1 = v1, v2 = v2)
v3
lapply(v3, val_labels)
> v3
x x.1
1 1 1
2 2 2
3 2 2
4 2 2
5 5 5
6 6 6
> lapply(v3, val_labels)
$x
agree disagree dk refused
1 2 5 6
$x.1
agree disagree dk refused
1 2 NA NA
Word of caution:
SPSS’s user-defined values work differently to SAS and Stata. Each column can have either up to three distinct values that are considered as missing, or a range. Haven provides labelled_spss() as a subclass of labelled() to model these additional user-defined missings.
I hope the above helps
Take care
T.
References:
https://cran.r-project.org/web/packages/haven/haven.pdf
https://cran.r-project.org/web/packages/haven/vignettes/semantics.html
https://www.spss-tutorials.com/spss-missing-values-tutorial/
Addendum Example using Public Data...
SPSS Missing Values Example using an SPPS Data file {hospital.sav}
Firstly, let's make sure we highlight that
System missing values - are values that are completely absent from the data
User missing values are values that are present in the data but must be excluded from calculations.
SPSS View of Data...
Let's review the image and the data... The SPSS data shown in the variable view shows that each row has a Label [Column5], we note that rows 10 through 14 have specific values attributed to them [1..6] [Column 6] that have name attributes and that no values have been specified as Missing [Column 7].
Now let's look at the SPSS data view:
Here we can note that there is missing data... (See hilighted "."'s). The key point is that we have Missing data, but currently have no "Missing User Values"
Now let's turn to R, and load the data into R
hospital_url <- "https://www.spss-tutorials.com/downloads/hospital.sav"
# user_na = FALSE: SPSS user-declared missing values are imported as
# regular values (we will declare them as missing ourselves below).
hospital <- read_sav(hospital_url,
user_na = FALSE)
head(hospital,5)
# We're interested in columns 10 through 14...
head(hospital[10:14],5)
Result
> hospital_url <- "https://www.spss-tutorials.com/downloads/hospital.sav"
> hospital <- read_sav(hospital_url,
+ user_na = FALSE)
> head(hospital,5)
# A tibble: 5 × 14
visit_id patient_id first_name surname_prefix last_name gender entry_date entry_time
<dbl> <dbl> <chr> <chr> <chr> <dbl+lbl> <date> <time>
1 32943 23176 JEFFREY DIJKSTRA 1 2013-01-08 16:56:10
2 32944 20754 MARK VAN DER BERG 1 2013-02-01 14:24:45
3 32945 25419 WILLEM VERMEULEN 1 2013-02-02 10:01:43
4 32946 21139 LINDA JANSSEN 0 2013-02-10 10:24:39
5 32947 25419 WILLEM VERMEULEN 1 2013-02-10 18:05:59
# ... with 6 more variables: exit_moment <dttm>, doctor_rating <dbl+lbl>, nurse_rating <dbl+lbl>,
# room_rating <dbl+lbl>, food_rating <dbl+lbl>, facilities_rating <dbl+lbl>
Columns 10 through 14 contain Values
1="Very Dissatisfied"
2="Dissatisfied"
3="Neutral"
4="Satisfied"
5="Very Satisfied"
6="Not applicable or don't want to answer"
thus:
> head(hospital[10:14],5)
# A tibble: 5 × 5
doctor_rating nurse_rating room_rating food_rating facilities_rating
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
1 5 5 4 2 3
2 4 5 4 3 3
3 5 6 4 5 4
4 4 5 5 4 4
5 5 5 6 6 6
SPSS Value Labels
> lapply(hospital[10], val_labels)
$doctor_rating
Very dissatisfied Dissatisfied
1 2
Neutral Satisfied
3 4
Very satisfied Not applicable or don't want to answer
5 6
ok, note that above we can confirm we have imported the Value Labels.
Remove Non-Applicable data from the survey data
Our goal is to now remove the "Not applicable or don't want to answer" data entries by setting them to be "User NA values" i.e. An SPSS missing value.
Solution - Step 1 - A Single Column
We wish to set the missing value attribute across multiple columns in the data... Let first do this for one column...
Note we use add_value_labels not set_value_labels as we wish to append a new label, not completely overwrite existing labels...
d <- hospital
mean(d$doctor_rating, na.rm = TRUE)   # baseline: 6s still inflate/shift the mean
# Append (add_value_labels, not set_value_labels, so existing labels are
# kept) a label for the tagged NA, then declare 6 as a user-defined missing.
# Fixed: set_na_values() must mark 6 ("Not applicable or don't want to
# answer"), not 5 ("Very satisfied") — the transcript below also uses 6.
d <- hospital %>%
  add_value_labels(doctor_rating = c("Not applicable or don't want to answer"
                                     = tagged_na("6"))) %>%
  set_na_values(doctor_rating = 6)
val_labels(d$doctor_rating)
mean(d$doctor_rating, na.rm = TRUE)   # 6s now excluded from the mean
> d <- hospital
> mean(d$doctor_rating, na.rm = TRUE)
[1] 4.322368
> d <- hospital %>%
+ add_value_labels( doctor_rating = c( "Not applicable or don't want to answer"
+ = tagged_na("6") )) %>%
+ set_na_values(doctor_rating = 6)
> val_labels(d$doctor_rating)
Very dissatisfied Dissatisfied
1 2
Neutral Satisfied
3 4
Very satisfied Not applicable or don't want to answer
5 6
Not applicable or don't want to answer
NA
> mean(d$doctor_rating, na.rm = TRUE)
[1] 4.097015
Solution - Step 2 - Now apply to multiple columns...
# nurse_rating contains system-missing values, so a bare mean() is NaN.
mean(hospital$nurse_rating)
mean(hospital$nurse_rating, na.rm = TRUE)
# Repeat the label + user-missing declaration for each rating column.
d <- hospital %>%
add_value_labels( doctor_rating = c( "Not applicable or don't want to answer"
= tagged_na("6") )) %>%
set_na_values(doctor_rating = 6) %>%
add_value_labels( nurse_rating = c( "Not applicable or don't want to answer"
= tagged_na("6") )) %>%
set_na_values(nurse_rating = 6)
# Mean now excludes the "Not applicable" responses as well.
mean(d$nurse_rating, na.rm = TRUE)
Result
Note that nurse_rating contains "NaN" values and NA tagged values.
The first mean() call fails (returns NaN); the second succeeds but still counts the "Not Applicable..." responses. After declaring them as missing, the "Not Applicable..." values are excluded from the mean.
> mean(hospital$nurse_rating)
[1] NaN
> mean(hospital$nurse_rating, na.rm = TRUE)
[1] 4.471429
> d <- hospital %>%
+ add_value_labels( doctor_rating = c( "Not applicable or don't want to answer"
+ = tagged_na("6") )) %>%
+ set_na_values(doctor_rating = 6) %>%
+ add_value_labels( nurse_rating = c( "Not applicable or don't want to answer"
+ = tagged_na("6") )) %>%
+ set_na_values(nurse_rating = 6)
> mean(d$nurse_rating, na.rm = TRUE)
[1] 4.341085
Convert tagged NA to R NA
Here we take the above tagged NA and convert to R NA values.
d <- d %>% remove_labels(user_na_to_na = TRUE)
Not quite sure if this is what you are looking for:
v1 <- labelled(c(1, 2, 2, 2, 5, 6), c(agree = 1, disagree = 2, dk = 5, refused = 6))
v2 <- labelled(c(1, 2, 2, 2, 5, 6), c(agree = 1, disagree = 2, dk = 5, refused = 6))
# Fixed: data_frame() is deprecated in tibble; use tibble() instead.
v3 <- tibble(v1 = v1, v2 = v2)
# NOTE(review): the `<<-` super-assignment reaches out of the anonymous
# function to modify v3 in the enclosing environment. It works, but a
# plain loop (or lapply returning the fixed columns) is clearer.
lapply(names(v3), FUN = function(x) {
  na_values(v3[[x]]) <<- 5:6
})
lapply(v3, na_values)
The last line returning
$v1
[1] 5 6
$v2
[1] 5 6
Verify missing values:
is.na(v3$v1)
[1] FALSE FALSE FALSE FALSE TRUE TRUE
Defining SPSS-style user-defined missing values
Main functions
The two main functions in labelled package for manipulating SPSS style user-defined missing values are na_values and na_range.
library(labelled)
# Start from a plain numeric vector, attach value labels, then declare
# 5 and 6 as SPSS-style user-defined missing values (metadata only —
# the underlying values are unchanged).
v1 <-c(1,2,2,2,5,6)
val_labels(v1) <- c(agree=1, disagree=2, dk=5, refused=6)
na_values(v1) <- 5:6
v1
<Labelled SPSS double>
[1] 1 2 2 2 5 6
Missing values: 5, 6
Labels:
value label
1 agree
2 disagree
5 dk
6 refused
set_* functions
The set_* functions in labelled are intended to be used with magrittr / dplyr.
library(dplyr)
d <- tibble(v1 = c(1, 2, 2, 2, 5, 6), v2 = c(1:3, 1:3))
# The set_* functions take the whole data frame, so they slot into a
# dplyr/magrittr pipeline: label the values, then mark 5:6 as user missings.
d <- d %>%
set_value_labels(v1 = c(agree=1, disagree=2, dk=5, refused=6)) %>%
set_na_values(v1 = 5:6)
d$v1
<Labelled SPSS double>
[1] 1 2 2 2 5 6
Missing values: 5, 6
Labels:
value label
1 agree
2 disagree
5 dk
6 refused
What are user-defined missing values?
User-defined missing values are just and only meta-information. It doesn't change the values in a vector. This is simply a way to say to the user that these values could/should be considered in some context as missing values. It means that if you compute something (e.g. mean) from your vector, these values will still be taken into account.
mean(v1)
[1] 3
You can easily convert user-defined missing values to proper NA with user_na_to_na.
mean(user_na_to_na(v1), na.rm = TRUE)
[1] 1.75
There are very few functions that would take into account these meta-information. See for example the freq function from questionr package.
library(questionr)
freq(v1)
n % val%
[1] agree 1 16.7 25
[2] disagree 3 50.0 75
[5] dk 1 16.7 NA
[6] refused 1 16.7 NA
NA 0 0.0 NA
What is the difference with tagged NAs ?
The purpose of tagged NAs, introduced by haven, is to reproduce the way Stata is managing missing values. All tagged NAs are internally considered as NA by R.
Is this correct?
#Using replace to substitute 5 and 6 in v3 with NA
# NOTE(review): rebuilding via data.frame() drops the labelled class and
# value-label attributes from the result — fine if plain NA columns are
# all that is wanted.
data.frame(lapply(v3, function(a) replace(x = a, list = a %in% c(5,6), values = NA)))
# x x.1
#1 1 1
#2 2 2
#3 2 2
#4 2 2
#5 NA NA
#6 NA NA
I know labelled_spss allows you to specify na_range or even a vector of na_values
#DATA
# v11 declares 5:6 as a missing *range*; is.na() then reports TRUE for them.
v11 = labelled_spss(x = c(1,2,2,2,5,6),
labels = c(agree=1, disagree=2, dk=5, refused=6),
na_range = 5:6)
#Check if v11 has NA values
is.na(v11)
#[1] FALSE FALSE FALSE FALSE TRUE TRUE
# NOTE(review): v22 repeats the na_range form; to illustrate the vector
# form mentioned above, this could use na_values = c(5, 6) — same result here.
v22 = labelled_spss(x = c(1,2,2,2,5,6),
labels = c(agree=1, disagree=2, dk=5, refused=6),
na_range = 5:6)
#Put v11 and v22 in a list
v33 = list(v11, v22)
#Use replace like above
data.frame(lapply(X = v33, FUN = function(a) replace(x = a, list = is.na(a), values = NA)))
# x x.1
#1 1 1
#2 2 2
#3 2 2
#4 2 2
#5 NA NA
#6 NA NA
The first argument to set_na_values is a data frame, not a vector/column, which is why your lapply command doesn't work. You could build a list of the arguments for set_na_values for an arbitrary number of columns in your data frame and then call it with do.call as below...
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v3<-data.frame(v1=v1, v2=v2)
na_values(v3)
# Build the argument list programmatically: the data frame plus one
# `c(5,6)` entry named after each column, then call set_na_values once.
args <- c(list(.data = v3), setNames(lapply(names(v3), function(x) c(5,6)), names(v3)))
v3 <- do.call(set_na_values, args)
na_values(v3)
Update: You can also use the assignment form of the na_values function within an lapply statement, since it accepts a vector as its first argument instead of a data frame like set_na_values...
library(haven)
library(labelled)
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v3<-data.frame(v1=v1, v2=v2)
lapply(v3, val_labels)
na_values(v3)
# `na_values<-` is the replacement function; calling it directly returns
# the modified vector. `v3[] <-` keeps the data.frame shape while
# replacing every column.
v3[] <- lapply(v3, function(x) `na_values<-`(x, c(5,6)))
na_values(v3)
or even use the normal version of na_values in the lapply command, just making sure to return the 'fixed' vector...
library(haven)
library(labelled)
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v3<-data.frame(v1=v1, v2=v2)
lapply(v3, val_labels)
na_values(v3)
# Same idea with the ordinary assignment form: set na_values on the local
# copy and return it explicitly (`; x`) so lapply collects the fixed column.
v3[] <- lapply(v3, function(x) { na_values(x) <- c(5,6); x } )
na_values(v3)
and that idea can be used inside of a dplyr chain as well, either applying to all variables, or applying to whatever columns are selected using dplyr's selection tools...
library(haven)
library(labelled)
library(dplyr)
v1<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v2<-labelled(c(1,2,2,2,5,6), c(agree=1, disagree=2, dk=5, refused=6))
v3<-data.frame(v1=v1, v2=v2)
lapply(v3, val_labels)
na_values(v3)
# Fixed: funs() and mutate_all()/mutate_each() are deprecated/defunct in
# current dplyr — use mutate(across(...)). Also fixed: the original
# mutate_each() call selected column `x`, which does not exist in v3
# (its columns are v1 and v2).
v4 <- v3 %>% mutate(across(everything(), ~ `na_values<-`(.x, c(5,6))))
na_values(v4)
# Same, restricted to selected columns via tidyselect:
v5 <- v3 %>% mutate(across(v1, ~ `na_values<-`(.x, c(5,6))))
na_values(v5)
You could use a very simple solution in using base R:
# Plain logical subsetting: overwrite the 5s and 6s with real NA values
# (note this changes the data itself, not just the missing-value metadata).
v3[v3 == 5 ] <- NA
v3[v3 == 6 ] <- NA
But if you're looking for a really fast solution, you can use a data.table approach.
library(data.table)
setDT(v3)  # convert v3 to a data.table by reference (no copy)
# set() assigns NA in place, column by column, only at the matching rows —
# avoids copying the whole table, which matters on large data.
for(j in seq_along(v3)) {
set(v3, i=which(v3[[j]] %in% c(5,6)), j=j, value=NA)
}
I have a dataset with 500,000 entries. Each entry has a userId and a productId. I want to get all userIds corresponding to each distinct productId, but the list is so huge that none of the following methods works for me — it's going very slowly. Is there any faster solution?
Using lapply: (Problem: Traversing the whole rpid list for each uniqPids elements)
# NOTE(review): this is O(length(uniqPids) * length(rpid)) — it rescans the
# entire rpid vector once per unique product id, which is why it is slow.
orderedIndx <- lapply(uniqPids, function(x){
which(rpid %in% x)
})
names(orderedIndx) <- uniqPids
#Looking for indices with each unique productIds
Using For loop:
orderedIndx <- list()
# NOTE(review): `orderedIndx[rpid[j]]` uses single-bracket indexing, which
# returns a length-1 list, so `existing` is always 1 and each assignment
# overwrites rather than appends — presumably not what was intended.
# Growing a list element-by-element in a loop is also quadratic.
for(j in 1:length(rpid)){
existing <- length(orderedIndx[rpid[j]])
orderedIndx[rpid[j]][existing + 1] <- j
}
Sample Data:
ruid[1:10]
# [1] "a3sgxh7auhu8gw" "a1d87f6zcve5nk" "abxlmwjixxain" "a395borc6fgvxv" "a1uqrsclf8gw1t" "adt0srk1mgoeu"
[7] "a1sp2kvkfxxru1" "a3jrgqveqn31iq" "a1mzyo9tzk0bbi" "a21bt40vzccyt4"
rpid[1:10]
# [1] "b001e4kfg0" "b001e4kfg0" "b000lqoch0" "b000ua0qiq" "b006k2zz7k" "b006k2zz7k" "b006k2zz7k" "b006k2zz7k"
[9] "b000e7l2r4" "b00171apva"
Output should be like:
b001e4kfg0 -> a3sgxh7auhu8gw, a1d87f6zcve5nk
b000lqoch0 -> abxlmwjixxain
b000ua0qiq -> a395borc6fgvxv
b006k2zz7k -> a1uqrsclf8gw1t, adt0srk1mgoeu, a1sp2kvkfxxru1, a3jrgqveqn31iq
b000e7l2r4 -> a1mzyo9tzk0bbi
b00171apva -> a21bt40vzccyt4
It seems perhaps you're just looking for split?
split(seq_along(rpid), rpid)
Not exactly sure what type of output you want, or how many rows you have in your dataset, but I'd suggest 3 versions and you can choose the one you like. The first version uses dplyr and character values for your variables. I expect this to be slow if you have millions of rows. The second version uses dplyr but factor variables. I expect this to be faster than the previous one. The third version uses data.table. I expect this to be equally fast as, or faster than, the second version.
library(dplyr)
ruid =
c("a3sgxh7auhu8gw", "a1d87f6zcve5nk", "abxlmwjixxain", "a395borc6fgvxv",
"a1uqrsclf8gw1t", "adt0srk1mgoeu", "a1sp2kvkfxxru1", "a3jrgqveqn31iq",
"a1mzyo9tzk0bbi", "a21bt40vzccyt4")
rpid =
c("b001e4kfg0", "b001e4kfg0", "b000lqoch0", "b000ua0qiq", "b006k2zz7k",
"b006k2zz7k", "b006k2zz7k", "b006k2zz7k", "b000e7l2r4", "b00171apva")
### using dplyr and character values
dt <- data.frame(rpid, ruid, stringsAsFactors = FALSE)  # spell out FALSE, not F
# Fixed/modernized: do() is superseded in dplyr; a grouped summarise()
# yields the same one-row-per-product result, more simply and faster.
dt %>%
  group_by(rpid) %>%
  summarise(list_ruids = paste(ruid, collapse = ", ")) %>%
  ungroup
# rpid list_ruids
# (chr) (chr)
# 1 b000e7l2r4 a1mzyo9tzk0bbi
# 2 b000lqoch0 abxlmwjixxain
# 3 b000ua0qiq a395borc6fgvxv
# 4 b00171apva a21bt40vzccyt4
# 5 b001e4kfg0 a3sgxh7auhu8gw, a1d87f6zcve5nk
# 6 b006k2zz7k a1uqrsclf8gw1t, adt0srk1mgoeu, a1sp2kvkfxxru1, a3jrgqveqn31iq
# ----------------------------------
### using dplyr and factor values
dt = data.frame(rpid, ruid, stringsAsFactors = T)
# levels(dt$ruid)[.$ruid] maps the factor codes back to their labels before
# pasting. NOTE(review): do() is superseded in current dplyr; a grouped
# summarise() would express the same operation today.
dt %>%
group_by(rpid) %>%
do(data.frame(list_ruids = paste(c(levels(dt$ruid)[.$ruid]), collapse=", "))) %>%
ungroup
# rpid list_ruids
# (fctr) (chr)
# 1 b000e7l2r4 a1mzyo9tzk0bbi
# 2 b000lqoch0 abxlmwjixxain
# 3 b000ua0qiq a395borc6fgvxv
# 4 b00171apva a21bt40vzccyt4
# 5 b001e4kfg0 a3sgxh7auhu8gw, a1d87f6zcve5nk
# 6 b006k2zz7k a1uqrsclf8gw1t, adt0srk1mgoeu, a1sp2kvkfxxru1, a3jrgqveqn31iq
# -------------------------------------
library(data.table)
### using data.table
dt = data.table(rpid, ruid)
# One row per product: paste the user ids together within each rpid group.
# .() is the data.table alias for list(); the c() around ruid was redundant.
dt[, .(list_ruids = paste(ruid, collapse = ", ")), by = rpid]
# rpid list_ruids
# 1: b001e4kfg0 a3sgxh7auhu8gw, a1d87f6zcve5nk
# 2: b000lqoch0 abxlmwjixxain
# 3: b000ua0qiq a395borc6fgvxv
# 4: b006k2zz7k a1uqrsclf8gw1t, adt0srk1mgoeu, a1sp2kvkfxxru1, a3jrgqveqn31iq
# 5: b000e7l2r4 a1mzyo9tzk0bbi
# 6: b00171apva a21bt40vzccyt4
Do you have tidy data in a dataframe? Then you can do this.
library(dplyr)
# distinct() with column names both selects those columns and drops
# duplicate rows, so the separate select() step is not needed.
df %>%
  distinct(productId, userId)