"Collapsing" data columns - r

Can I please get some help wrangling this messy dataset?
The following reprex describes treatments and start/stop dates for five patients. The columns Line1, Line2, Line3 describe the order in which the treatments were administered (i.e. first treatment, second treatment, etc.). However, you can see that the data has been entered such that certain patients don't have their first treatment in the first column. For example, ID3's first treatment was TreatmentD, but it has been entered into Line3. To complicate matters, some columns have been skipped altogether between consecutive treatments (e.g. ID4).
original_data <- data.frame(
  stringsAsFactors = FALSE,
  patient_id = c("ID1", "ID2", "ID3", "ID4", "ID5"),
  Line1_name = c("TreatmentA", NA, NA, NA, NA),
  Line1_start = c("5/5/17", NA, NA, NA, NA),
  Line1_stop = c("18/8/17", NA, NA, NA, NA),
  Line2_name = c("TreatmentF", "TreatmentB", NA, "TreatmentB", "TreatmentF"),
  Line2_start = c("6/11/18", "6/6/18", NA, "3/9/18", "15/11/18"),
  Line2_stop = c("19/12/18", "12/12/18", NA, "22/2/19", "15/6/19"),
  Line3_name = c("TreatmentC", NA, "TreatmentD", NA, "TreatmentC"),
  Line3_start = c("13/2/19", NA, "24/11/17", NA, "29/6/19"),
  Line3_stop = c("2/4/19", NA, "3/4/18", NA, "15/9/19"),
  Line4_name = c(NA, NA, NA, "TreatmentA", NA),
  Line4_start = c(NA, NA, NA, "22/2/19", NA),
  Line4_stop = c(NA, NA, NA, "8/7/19", NA),
  Line5_name = c(NA, NA, NA, NA, "TreatmentE"),
  Line5_start = c(NA, NA, NA, NA, "15/1/20"),
  Line5_stop = c(NA, NA, NA, NA, "20/5/20")
)
head(original_data)
#> patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start
#> 1 ID1 TreatmentA 5/5/17 18/8/17 TreatmentF 6/11/18
#> 2 ID2 <NA> <NA> <NA> TreatmentB 6/6/18
#> 3 ID3 <NA> <NA> <NA> <NA> <NA>
#> 4 ID4 <NA> <NA> <NA> TreatmentB 3/9/18
#> 5 ID5 <NA> <NA> <NA> TreatmentF 15/11/18
#> Line2_stop Line3_name Line3_start Line3_stop Line4_name Line4_start
#> 1 19/12/18 TreatmentC 13/2/19 2/4/19 <NA> <NA>
#> 2 12/12/18 <NA> <NA> <NA> <NA> <NA>
#> 3 <NA> TreatmentD 24/11/17 3/4/18 <NA> <NA>
#> 4 22/2/19 <NA> <NA> <NA> TreatmentA 22/2/19
#> 5 15/6/19 TreatmentC 29/6/19 15/9/19 <NA> <NA>
#> Line4_stop Line5_name Line5_start Line5_stop
#> 1 <NA> <NA> <NA> <NA>
#> 2 <NA> <NA> <NA> <NA>
#> 3 <NA> <NA> <NA> <NA>
#> 4 8/7/19 <NA> <NA> <NA>
#> 5 <NA> TreatmentE 15/1/20 20/5/20
Question: is there a way I can “collapse” down the data such that no columns are skipped, and all data is “left-shifted” to the earliest empty treatment slot? I tried using the dplyr::coalesce() function, but while I can coalesce Line2 into Line1, I can’t coalesce Line3 into Line2, because the original Line2 contents are still present (sorry, bit hard to explain). I feel like it probably is the right function though…
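For illustration, here is the pairwise coalesce I mean, using just the name columns; it fills Line1 but leaves Line2 untouched, so ID2's TreatmentB would sit in both columns:
library(dplyr)
coalesce(original_data$Line1_name, original_data$Line2_name)
#> [1] "TreatmentA" "TreatmentB" NA           "TreatmentB" "TreatmentF"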
My goal is to get to this:
final_data <- data.frame(
  stringsAsFactors = FALSE,
  patient_id = c("ID1", "ID2", "ID3", "ID4", "ID5"),
  Line1_name = c("TreatmentA", "TreatmentB", "TreatmentD", "TreatmentB", "TreatmentF"),
  Line1_start = c("5/5/17", "6/6/18", "24/11/17", "3/9/18", "15/11/18"),
  Line1_stop = c("18/8/17", "12/12/18", "3/4/18", "22/2/19", "15/6/19"),
  Line2_name = c("TreatmentF", NA, NA, "TreatmentA", "TreatmentC"),
  Line2_start = c("6/11/18", NA, NA, "22/2/19", "29/6/19"),
  Line2_stop = c("19/12/18", NA, NA, "8/7/19", "15/9/19"),
  Line3_name = c("TreatmentC", NA, NA, NA, "TreatmentE"),
  Line3_start = c("13/2/19", NA, NA, NA, "15/1/20"),
  Line3_stop = c("2/4/19", NA, NA, NA, "20/5/20"),
  Line4_name = c(NA, NA, NA, NA, NA),
  Line4_start = c(NA, NA, NA, NA, NA),
  Line4_stop = c(NA, NA, NA, NA, NA),
  Line5_name = c(NA, NA, NA, NA, NA),
  Line5_start = c(NA, NA, NA, NA, NA),
  Line5_stop = c(NA, NA, NA, NA, NA)
)
head(final_data)
#> patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start
#> 1 ID1 TreatmentA 5/5/17 18/8/17 TreatmentF 6/11/18
#> 2 ID2 TreatmentB 6/6/18 12/12/18 <NA> <NA>
#> 3 ID3 TreatmentD 24/11/17 3/4/18 <NA> <NA>
#> 4 ID4 TreatmentB 3/9/18 22/2/19 TreatmentA 22/2/19
#> 5 ID5 TreatmentF 15/11/18 15/6/19 TreatmentC 29/6/19
#> Line2_stop Line3_name Line3_start Line3_stop Line4_name Line4_start
#> 1 19/12/18 TreatmentC 13/2/19 2/4/19 NA NA
#> 2 <NA> <NA> <NA> <NA> NA NA
#> 3 <NA> <NA> <NA> <NA> NA NA
#> 4 8/7/19 <NA> <NA> <NA> NA NA
#> 5 15/9/19 TreatmentE 15/1/20 20/5/20 NA NA
#> Line4_stop Line5_name Line5_start Line5_stop
#> 1 NA NA NA NA
#> 2 NA NA NA NA
#> 3 NA NA NA NA
#> 4 NA NA NA NA
#> 5 NA NA NA NA
Thanks!

Interesting question!
library(dplyr)
library(tidyr)
original_data %>%
  # reshape to one row per patient/line/property, dropping the empty slots
  pivot_longer(starts_with("Line"),
               names_sep = "_",
               names_to = c("line", "prop"),
               values_drop_na = TRUE) %>%
  group_by(patient_id) %>%
  # factor() keeps only the line numbers present for each patient, so
  # as.numeric() renumbers them 1, 2, 3, ... without gaps
  mutate(line = as.numeric(factor(line))) %>%
  ungroup() %>%
  mutate(name = sprintf("Line%s_%s", line, prop)) %>%
  pivot_wider(id_cols = patient_id, names_from = name, values_from = value)
returns:
# A tibble: 5 x 10
patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start Line2_stop
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID1 TreatmentA 5/5/17 18/8/17 TreatmentF 6/11/18 19/12/18
2 ID2 TreatmentB 6/6/18 12/12/18 NA NA NA
3 ID3 TreatmentD 24/11/17 3/4/18 NA NA NA
4 ID4 TreatmentB 3/9/18 22/2/19 TreatmentA 22/2/19 8/7/19
5 ID5 TreatmentF 15/11/18 15/6/19 TreatmentC 29/6/19 15/9/19
# … with 3 more variables: Line3_name <chr>, Line3_start <chr>,
# Line3_stop <chr>

You can shift all the non-NA values to the left:
# for each row, drop the NAs and pad the result back to full length with NA
original_data[] <- t(apply(original_data, 1, function(x) na.omit(x)[1:length(x)]))
# patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start Line2_stop
#1 ID1 TreatmentA 5/5/17 18/8/17 TreatmentF 6/11/18 19/12/18
#2 ID2 TreatmentB 6/6/18 12/12/18 <NA> <NA> <NA>
#3 ID3 TreatmentD 24/11/17 3/4/18 <NA> <NA> <NA>
#4 ID4 TreatmentB 3/9/18 22/2/19 TreatmentA 22/2/19 8/7/19
#5 ID5 TreatmentF 15/11/18 15/6/19 TreatmentC 29/6/19 15/9/19
# Line3_name Line3_start Line3_stop Line4_name Line4_start Line4_stop Line5_name
#1 TreatmentC 13/2/19 2/4/19 <NA> <NA> <NA> <NA>
#2 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#3 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#4 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#5 TreatmentE 15/1/20 20/5/20 <NA> <NA> <NA> <NA>
# Line5_start Line5_stop
#1 <NA> <NA>
#2 <NA> <NA>
#3 <NA> <NA>
#4 <NA> <NA>
#5 <NA> <NA>
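Note that apply() works on a matrix, so the data frame gets coerced to character row by row. That is harmless here because every column is already character, but with mixed column types you would want a type-aware approach such as the pivot_longer() solution above.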

Related

How to filter only subjects observed more than once in panel data with R?

I am analyzing panel data with R now, and the data format is as follows.
pid wave edu marri rela age apt sido dongy urban stat1 stat2 exer dep3 bmi mmse
1 3122 1 2 <NA> NA NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
2 3122 1 NA 1 NA NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
3 3122 1 NA <NA> 3 NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
4 3122 1 NA <NA> NA 71 <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
5 3122 1 NA <NA> NA NA 1 NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
6 3122 1 NA <NA> NA NA <NA> 11 <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
The data are repeated measurements, and there are many missing values. If I keep only fully observed values in every year, I lose too many observations, so I want to select and analyze only the subjects whose 'mmse' variable has been measured more than once.
I tried to check the change of the variable of interest through the following code, but it didn't work.
df %>%
arrange(pid, wave) %>%
group_by(pid) %>%
mutate(
mmse_change = mmse - lag(mmse),
mmse_increase = mmse_change > 0,
mmse_decrease = mmse_change < 0
)
I need the resulting object to analyze the baseline characteristics. How can I extract the subjects that meet this condition?
We could do something like this:
df %>%
  filter(!is.na(mmse)) %>% # just keep rows with non-NA in mmse
  count(pid) %>%           # count how many observations per pid
  filter(n > 1) %>%        # keep those pid's appearing more than once
  select(pid) %>%          # just keep the pid column
  left_join(df)            # get `df` back for just those pid's
Another approach without a join is to group_by(pid) and then filter all groups where max(row_number()) > 1.
Below I changed your initial data so that it can be used for this problem (your original data has only NAs in mmse; please post your data as reproducible code next time).
library(tidyverse)
# initial data slightly changed:
df <- tribble(
  ~pid, ~wave, ~edu, ~marri, ~rela, ~age, ~apt, ~sido, ~dongy, ~urban, ~stat1, ~stat2, ~exer, ~dep3, ~bmi, ~mmse,
  3122, 1,  2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  1,
  3122, 1, NA,  1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
  3122, 1, NA, NA,  3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  2,
  3122, 1, NA, NA, NA, 71, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
  3122, 1, NA, NA, NA, NA,  1, NA, NA, NA, NA, NA, NA, NA, NA,  3,
  3124, 1, NA, NA, NA, NA, NA, 11, NA, NA, NA, NA, NA, NA, NA,  5
)
df %>%
  filter(!is.na(mmse)) %>%
  group_by(pid) %>%
  filter(max(row_number()) > 1) %>%
  ungroup()
#> # A tibble: 3 x 16
#> pid wave edu marri rela age apt sido dongy urban stat1 stat2 exer
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 3122 1 2 NA NA NA NA NA NA NA NA NA NA
#> 2 3122 1 NA NA 3 NA NA NA NA NA NA NA NA
#> 3 3122 1 NA NA NA NA 1 NA NA NA NA NA NA
#> # ... with 3 more variables: dep3 <lgl>, bmi <lgl>, mmse <dbl>
Created on 2022-09-21 by the reprex package (v2.0.1)
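A shorthand for the same condition is filter(n() > 1), since max(row_number()) within a group is just n().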

What is the fastest way to add a new column based on dataframe entries in specific columns

So I have this dataframe
# Name Comp1 Con2 Vis3 Tra4 Pred5 Adap6
# 1 A1 x <NA> <NA> <NA> <NA> <NA>
# 2 A2 <NA> x <NA> <NA> <NA> <NA>
# 3 B1 <NA> <NA> x <NA> <NA> <NA>
# 4 B2 <NA> <NA> <NA> <NA> x <NA>
# 5 B3 <NA> <NA> <NA> x <NA> <NA>
# 6 D2 <NA> <NA> <NA> <NA> <NA> x
# 7 F6 <NA> <NA> <NA> <NA> x <NA>
I want to add a column to databackend that holds a value from 1 to 6, based on which column of databackend contains the "x". The additional column would look like this:
# Name Comp1 Con2 Vis3 Tra4 Pred5 Adap6 stage
# 1 A1 x <NA> <NA> <NA> <NA> <NA> 1
# 2 A2 <NA> x <NA> <NA> <NA> <NA> 2
# 3 B1 <NA> <NA> x <NA> <NA> <NA> 3
# 4 B2 <NA> <NA> <NA> <NA> x <NA> 5
# 5 B3 <NA> <NA> <NA> x <NA> <NA> 4
# 6 D2 <NA> <NA> <NA> <NA> <NA> x 6
# 7 F6 <NA> <NA> <NA> <NA> x <NA> 5
Since my dataframe is very large in the original script, I am looking for the fastest (automatic) way to do this. I've tried using a for loop, but it takes too long.
data
databackend <- structure(list(
  Name = c("A1", "A2", "B1", "B2", "B3", "D2", "F6"),
  Comp1 = c("x", NA, NA, NA, NA, NA, NA),
  Con2 = c(NA, "x", NA, NA, NA, NA, NA),
  Vis3 = c(NA, NA, "x", NA, NA, NA, NA),
  Tra4 = c(NA, NA, NA, NA, "x", NA, NA),
  Pred5 = c(NA, NA, NA, "x", NA, NA, "x"),
  Adap6 = c(NA, NA, NA, NA, NA, "x", NA),
  stage = c(1, 2, 3, 5, 4, 6, 5)
), row.names = c(NA, -7L), class = "data.frame")
You can do (assuming as in your example a single "x" in every row):
max.col(!is.na(databackend[-1]))
[1] 1 2 3 5 4 6 5
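Note that the dput above already contains the desired stage column; drop it (or select only the treatment columns) before calling max.col, otherwise the always-non-NA stage column throws the result off. A sketch:
max.col(!is.na(databackend[, 2:7]))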
Relatively simple:
> tmp = which(databackend[, -1] == "x", arr.ind = TRUE)
> tmp[order(tmp[, "row"]), "col"]
[1] 1 2 3 5 4 6 5
Using which and apply:
apply(databackend[-1], 1, \(x) which(x == "x"))
#[1] 1 2 3 5 4 6 5
A benchmark shows max.col is the fastest (timed on the small example data):
microbenchmark::microbenchmark(
  apply = apply(databackend[-1], 1, \(x) which(x == "x")),
  which = {
    tmp = which(databackend[, -1] == "x", arr.ind = TRUE)
    tmp[order(tmp[, "row"]), "col"]
  },
  max.col = max.col(!is.na(databackend[-1]))
)
Unit: microseconds
expr min lq mean median uq max neval
apply 149.4 165.95 232.308 196.20 216.95 2882.4 100
which 118.9 144.35 184.684 158.10 190.45 907.0 100
max.col 51.5 73.00 88.302 79.45 94.40 326.1 100
We can try
> rowSums(col(databackend[-1])*(!is.na(databackend[-1])))
[1] 1 2 3 5 4 6 5
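Here col(databackend[-1]) gives each cell's column index, multiplying by the logical non-NA matrix zeroes out every other cell, and rowSums() returns the single remaining index per row (again assuming one "x" per row).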

How to remove column(s) if a row contains a value?

I have seen lots of posts on how to remove rows if user-specified columns contain a certain string.
I want to do the reverse and generalise it: remove every column if any row in that column contains a certain string. (To compare with Excel, I would find all cells containing a given string and then delete those columns.)
How can I do this? I was thinking of dplyr and filter, but as far as I can tell I would have to specify the columns, and I have 300-odd columns and almost 4000 rows.
EDIT: Here is a sample of my dataframe.
# A tibble: 6 x 310
ISIN AU000KFWHAC9 AU3CB0243657 AU3CB0256162 AU3CB0260321 AU3CB0265239 AU3CB0283190 AU3SG0001928 AU3SG0002371
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Timestamp MID_PRICE Mid Price Cl~ Mid Price C~ Mid Price C~ Mid Price C~ Mid Price C~ Mid Price C~ Mid Price C~
2 41275 Invalid RIC. NA NA Invalid RIC. NA Invalid RIC. NA NA
3 41276 NA NA NA NA NA NA NA NA
4 41277 NA NA NA NA 3 NA NA NA
5 41278 NA NA NA NA NA NA NA NA
6 41279 5 NA 4 NA NA NA NA NA
As you can see, the dataframe is full of NAs. I am unsure whether this will affect some functions.
With a dataframe of:
> df <- data.frame(a=c("a", "b", "c"), b=c("bad string", "d", "e"), c=c("f", "g", "h"))
> df
a b c
1 a bad string f
2 b d g
3 c e h
>
Use colSums:
> df[, !colSums(df == "bad string")]
a c
1 a f
2 b g
3 c h
>
Only keep columns where colSums is 0.
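Since your real data is full of NAs, df == "bad string" will contain NAs and colSums() will return NA for those columns; adding na.rm = TRUE handles that. A sketch on the same toy df:
df[, colSums(df == "bad string", na.rm = TRUE) == 0]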
You can grep your search:
dat[,-grep("Invalid", dat)]
ISIN AU3CB0243657 AU3CB0256162 AU3CB0265239 AU3SG0001928 AU3SG0002371
1 Timestamp MidPriceC~ MidPriceC~ MidPriceC~ MidPriceC~ MidPriceC~
2 41275 <NA> <NA> <NA> <NA> <NA>
3 41276 <NA> <NA> <NA> <NA> <NA>
4 41277 <NA> <NA> 3 <NA> <NA>
5 41278 <NA> <NA> <NA> <NA> <NA>
6 41279 <NA> 4 <NA> <NA> <NA>
Data:
dat <- structure(list(ISIN = c("Timestamp", "41275", "41276", "41277",
"41278", "41279"), AU000KFWHAC9 = c("MID_PRICE", "Invalid_RIC.",
NA, NA, NA, "5"), AU3CB0243657 = c("MidPriceC~", NA, NA, NA,
NA, NA), AU3CB0256162 = c("MidPriceC~", NA, NA, NA, NA, "4"),
AU3CB0260321 = c("MidPriceC~", "Invalid_RIC.", NA, NA, NA,
NA), AU3CB0265239 = c("MidPriceC~", NA, NA, "3", NA, NA),
AU3CB0283190 = c("MidPriceC~", "Invalid_RIC.", NA, NA, NA,
NA), AU3SG0001928 = c("MidPriceC~", NA, NA, NA, NA, NA),
AU3SG0002371 = c("MidPriceC~", NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-6L))
A solution using dplyr. We can use select and where to apply a function that checks whether a column contains a certain string. dat is from Andre Wildberg's answer.
library(dplyr)
dat2 <- dat %>%
  select(where(function(x) all(!grepl("Invalid", x))))
dat2
# ISIN AU3CB0243657 AU3CB0256162 AU3CB0265239 AU3SG0001928 AU3SG0002371
# 1 Timestamp MidPriceC~ MidPriceC~ MidPriceC~ MidPriceC~ MidPriceC~
# 2 41275 <NA> <NA> <NA> <NA> <NA>
# 3 41276 <NA> <NA> <NA> <NA> <NA>
# 4 41277 <NA> <NA> 3 <NA> <NA>
# 5 41278 <NA> <NA> <NA> <NA> <NA>
# 6 41279 <NA> 4 <NA> <NA> <NA>
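The same predicate can be written more compactly with a purrr-style lambda, equivalent to the anonymous function above:
dat %>% select(where(~ !any(grepl("Invalid", .x))))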

Get element positions out of matrix1 in order to remove every element at the found positions in matrix2

I have the following test code:
test1 <- tibble::tribble(
  ~name1, ~name2, ~name3, ~name4, ~name5,
  "C452", NA, NA, NA, NA,
  "D622", "M245", NA, NA, NA,
  "J533", "J625", NA, NA, NA,
  "F226", "L325", NA, NA, NA,
  "B565", "F226", NA, NA, NA,
  "W342", "DUPLICATE", NA, NA, NA,
  "H452", "K632", "L553", "DUPLICATE", "R551",
  "C636", "J245", "DUPLICATE", NA, NA,
  "H525", NA, NA, NA, NA
)
test2 <- tibble::tribble(
  ~name1, ~name2, ~name3, ~name4, ~name5,
  "MICHAEL", NA, NA, NA, NA,
  "ALEXANDER", "GEORGE", NA, NA, NA,
  "CHUN", "KNAPPWOST", NA, NA, NA,
  "CIRO", "SIMON", NA, NA, NA,
  "ARMIN", "CIRO", NA, NA, NA,
  "JUERGEN", "JUERGEN", NA, NA, NA,
  "EDWARD", "PHILIPP", "TRU", "CHRISTOPHER", "VICTOR",
  "RAPHAEL", "CHRISTOPH", "JAMES", NA, NA,
  "NILES", NA, NA, NA, NA
)
###GETS THE LIST OF ELEMENTS THAT ARE DUPLICATES
position = grep('DUPLICATE',test1)
###THAT IS WHAT I WANT
gsub(position, 'DUPLICATE', test2)
I always get a warning and don't know how to fix that.
In the end I want to go through, let's say, a big matrix1, find the position of every 'DUPLICATE', and remove the elements of matrix2 at the exact positions (row, column) where 'DUPLICATE' was found in matrix1.
Probably really easy but I cannot figure it out somehow.
This is easy to do if you convert the data to data.frames.
test1 <- data.frame(test1)
test2 <- data.frame(test2)
# logical matrix indexing: replace each 'DUPLICATE' cell in test1 with the
# value at the same (row, column) position in test2
test1[test1 == 'DUPLICATE' & !is.na(test1)] <- test2[test1 == 'DUPLICATE' & !is.na(test1)]
test1
# name1 name2 name3 name4 name5
#1 C452 <NA> <NA> <NA> <NA>
#2 D622 M245 <NA> <NA> <NA>
#3 J533 J625 <NA> <NA> <NA>
#4 F226 L325 <NA> <NA> <NA>
#5 B565 F226 <NA> <NA> <NA>
#6 W342 JUERGEN <NA> <NA> <NA>
#7 H452 K632 L553 CHRISTOPHER R551
#8 C636 J245 JAMES <NA> <NA>
#9 H525 <NA> <NA> <NA> <NA>
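The & !is.na(test1) part matters: test1 == 'DUPLICATE' is NA wherever test1 is NA, and NAs are not allowed in a subscripted assignment.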
We can use dplyr, and there is no need to convert back to a data.frame. Just use mutate with across, extract the corresponding column of 'test2' by name with cur_column(), and replace the elements that are 'DUPLICATE' with the matching 'test2' values in case_when.
library(dplyr)
test1 <- test1 %>%
  mutate(across(everything(),
                ~ case_when(. %in% 'DUPLICATE' ~ test2[[cur_column()]],
                            TRUE ~ .)))
-output
test1
# A tibble: 9 x 5
name1 name2 name3 name4 name5
<chr> <chr> <chr> <chr> <chr>
1 C452 <NA> <NA> <NA> <NA>
2 D622 M245 <NA> <NA> <NA>
3 J533 J625 <NA> <NA> <NA>
4 F226 L325 <NA> <NA> <NA>
5 B565 F226 <NA> <NA> <NA>
6 W342 JUERGEN <NA> <NA> <NA>
7 H452 K632 L553 CHRISTOPHER R551
8 C636 J245 JAMES <NA> <NA>
9 H525 <NA> <NA> <NA> <NA>
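Both solutions assume that test1 and test2 have identical dimensions and column names, so positions map one-to-one between the two tables.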

How to locate a structured region of data inside an unstructured data frame in R?

I have a certain kind of data frame that contains a subset of interest. The problem is that this subset is not consistent across the different data frames. Nonetheless, at a more abstract level it follows a general structure: a rectangular region inside the data frame.
example1 <- data.frame(
  x = c("name", "129-2", NA, NA, "acc", 2, 3, 4, NA, NA),
  y = c(NA, NA, NA, NA, "deb", 3, 2, 5, NA, NA),
  z = c(NA, NA, NA, NA, "asset", 1, 1, 2, NA, NA)
)
print(example1)
x y z
1 name <NA> <NA>
2 129-2 <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 acc deb asset
6 2 3 1
7 3 2 1
8 4 5 2
9 <NA> <NA> <NA>
10 <NA> <NA> <NA>
example1 contains a clear rectangular region with structured information:
5 acc deb asset
6 2 3 1
7 3 2 1
8 4 5 2
As mentioned before, the region is not always consistent:
the position of the columns is not always the same
the names of the variables inside the subset of interest are not always the same
Here is another example, example2:
example2 <- data.frame(
  x = c("name", "129-2", "wallabe #23", NA, NA, "acc", 2, 3, 4, NA),
  y = c(NA, NA, NA, NA, "balance", "deb", 3, 2, 5, NA),
  z = c(NA, NA, NA, NA, NA, "asset", 1, 1, 2, NA),
  u = c(NA, NA, NA, "currency:", NA, NA, NA, NA, NA, NA),
  i = c(NA, NA, NA, "USD", "result", "win", 2, 3, 1, NA),
  o = c(NA, NA, NA, NA, NA, "lose", 2, 2, 1, NA)
)
print(example2)
x y z u i o
1 name <NA> <NA> <NA> <NA> <NA>
2 129-2 <NA> <NA> <NA> <NA> <NA>
3 wallabe #23 <NA> <NA> <NA> <NA> <NA>
4 <NA> <NA> <NA> currency: USD <NA>
5 <NA> balance <NA> <NA> result <NA>
6 acc deb asset <NA> win lose
7 2 3 1 <NA> 2 2
8 3 2 1 <NA> 3 2
9 4 5 2 <NA> 1 1
10 <NA> <NA> <NA> <NA> <NA> <NA>
example2 contains a less clear-cut rectangular region:
6 acc deb asset <NA> win lose
7 2 3 1 <NA> 2 2
8 3 2 1 <NA> 3 2
9 4 5 2 <NA> 1 1
Is there a method to scan a data frame and locate this kind of region inside it?
Any ideas are appreciated.
You might want to try the longest run of rows with the same number of NAs:
findTable <- function(df){
  naSeq <- rowSums(is.na(df))           # how many NAs per row
  myRle <- rle(naSeq)$lengths           # lengths of runs with equal NA counts
  df[rep(myRle == max(myRle), myRle), ] # keep the rows of the longest run
}
findTable(example1)
x y z
5 acc deb asset
6 2 3 1
7 3 2 1
8 4 5 2
findTable(example2)
x y z u i o
6 acc deb asset <NA> win lose
7 2 3 1 <NA> 2 2
8 3 2 1 <NA> 3 2
9 4 5 2 <NA> 1 1
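Note that findTable() assumes the table of interest is the longest run of consecutive rows sharing the same NA count; if two runs tie for the longest length, the rows of both runs are returned.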
