Replace string based on partial match - tidyverse - r

I have a character column. In some observations, "ph" was added to the end of the string. I want any string in the column that contains "ph" to be replaced with "NA". Here is what I have tried, along with the error it gives.
Column
wind_speed_m_s <- c("1.7", "0.7", "0", "0.6", "0.4", "1.2", "1.9", "1.3", "2.0, gust to 3.7",
"0.5", "1.8", "1.4", "3.4", "2.8", "1.6", "2", NA, "0.9", "0.8",
"1", "1.1", "2.6", "2.4", "1.1ph", "1.7 kt", "2.1", "1.5", "0ph",
"3", ".4 /s", "0.3", "2.3", "0.2", "3.3", "3.9ph", "1.5ph", "1ph",
"2ph", "1.7ph", "0.8 ph", "1.5 ph", "2.2", "1.9 k/hr", "2.5",
"NA", "0.4/s", "1/s")
date <- data_raw %>%
mutate(wind_speed_m_s = str_replace(wind_speed_m_s, pattern = str_detect("ph"), "NA"))
Error in `mutate()`:
! Problem while computing `wind_speed_m_s = str_replace(wind_speed_m_s, pattern = str_detect("ph"), "NA")`.
Caused by error in `type()`:
! argument "pattern" is missing, with no default
Backtrace:
1. ... %>% ...
9. stringr::str_detect("ph")
10. stringr:::type(pattern)

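We may use str_detect within case_when. In the OP's code, str_detect was called with only a single argument, i.e. the pattern, and without the data. That single argument was matched to the string parameter, so pattern was left missing, which is exactly what the error reports.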
library(dplyr)
library(stringr)
data_raw %>%
mutate(wind_speed_m_s = case_when(str_detect(wind_speed_m_s, "ph",
negate = TRUE)~ wind_speed_m_s))
-output
wind_speed_m_s
1 1.7
2 0.7
3 0
4 0.6
5 0.4
6 1.2
7 1.9
8 1.3
9 2.0, gust to 3.7
10 0.5
11 1.8
12 1.4
13 3.4
14 2.8
15 1.6
16 2
17 <NA>
18 0.9
19 0.8
20 1
21 1.1
22 2.6
23 2.4
24 <NA>
25 1.7 kt
26 2.1
27 1.5
28 <NA>
29 3
30 .4 /s
31 0.3
32 2.3
33 0.2
34 3.3
35 <NA>
36 <NA>
37 <NA>
38 <NA>
39 <NA>
40 <NA>
41 <NA>
42 2.2
43 1.9 k/hr
44 2.5
45 NA
46 0.4/s
47 1/s

You can use sub:
sub(".*ph$", NA, wind_speed_m_s)
[1] "1.7" "0.7" "0"
[4] "0.6" "0.4" "1.2"
[7] "1.9" "1.3" "2.0, gust to 3.7"
[10] "0.5" "1.8" "1.4"
[13] "3.4" "2.8" "1.6"
[16] "2" NA "0.9"
[19] "0.8" "1" "1.1"
[22] "2.6" "2.4" NA
[25] "1.7 kt" "2.1" "1.5"
[28] NA "3" ".4 /s"
[31] "0.3" "2.3" "0.2"
[34] "3.3" NA NA
[37] NA NA NA
[40] NA NA "2.2"
[43] "1.9 k/hr" "2.5" "NA"
[46] "0.4/s" "1/s"
Also can do:
is.na(wind_speed_m_s) <- grepl("ph$", wind_speed_m_s)
Note that $ is needed to anchor the match to the end of the string, in case there is another ph in the middle of a string. If you want to match ph anywhere, regardless of whether it's at the end or in the middle, just remove the $.

We can use grep to identify where the pattern occurs and use the result as an index for replacement.
> wind_speed_m_s[grep("ph", wind_speed_m_s)] <- NA
> wind_speed_m_s
[1] "1.7" "0.7" "0" "0.6" "0.4" "1.2"
[7] "1.9" "1.3" "2.0, gust to 3.7" "0.5" "1.8" "1.4"
[13] "3.4" "2.8" "1.6" "2" NA "0.9"
[19] "0.8" "1" "1.1" "2.6" "2.4" NA
[25] "1.7 kt" "2.1" "1.5" NA "3" ".4 /s"
[31] "0.3" "2.3" "0.2" "3.3" NA NA
[37] NA NA NA NA NA "2.2"
[43] "1.9 k/hr" "2.5" "NA" "0.4/s" "1/s"

Related

Scraping front page Coinmarketcap into a dataframe

Hi, what I want is to get the front page of Coinmarketcap into a data frame. This is what I have so far, but the data looks unorganized and I don't know how to turn it into a neat df.
library(jsonlite)
library ( tidyverse)
library( rvest )
# lets get what is marketcap today.
json_data <- read_html(c ( 'https://coinmarketcap.com/')) %>%
html_node("#__NEXT_DATA__") %>%
html_text() %>%
fromJSON()
json_data$props$initialState$cryptocurrency$listingLatest$data
What I end up getting is a long list that I cannot make sense of. I know the data is in there because the list looks something like this, but I don't know how to parse it.
[121] "quotes.2.percentChange60d" "quotes.2.percentChange7d" "quotes.2.percentChange90d" "quotes.2.price"
[125] "quotes.2.selfReportedMarketCap" "quotes.2.turnover" "quotes.2.volume24h" "quotes.2.volume30d"
[129] "quotes.2.volume7d" "quotes.2.ytdPriceChangePercentage" "rank" "selfReportedCirculatingSupply"
[133] "slug" "symbol" "totalSupply" "tvl"
[[1]]$id
[1] "COMPRESSED_KEYS_ARR"
[[1]]$excludeProps
[1] "auditInfoList"
[[2]]
[1] "68789.6259389221" "65.5260009765625" "18908943" "1" "2013-04-28T00:00:00.000Z"
[6] "TRUE" "FALSE" "50755.7211665326" "1" "1"
[11] "FALSE" "2021-12-23T19:20:02.000Z" "48065.8375264037" "8093" "21000000"
[16] "Bitcoin" "40.4175" "1065349214847.34" "2021-12-23T19:21:02.000Z" "18897342.6115399"
[21] "18897342.6115399" "BTC" "0" "0" "0"
[26] "0" "0" "0" "1" "0"
[31] "0.02793205" "527841.47774037" "21776428.8780472" "3626419.86588612" "72.706"
[36] "40.4175" "1065349214847.34" "2021-12-23T19:21:02.000Z" "232885004.198773" "232885004.198773"
[41] "ETH" "-0.189131" "0.653349" "-11.42415087" "-16.02722155"
[46] "3.129837" "19.93155879" "12.31613021" "0" "0.02793205"
[51] "6504955.07684694" "268365972.663341" "44690876.5456617" "72.706" "40.4175"
[56] "1065349214847.34" "2021-12-23T19:20:02.000Z" "959267979935.385" "959267979935.385" "USD"
[61] "0.53649283" "3.98091259" "-11.42415087" "-16.02722155" "5.84148872"
[66] "19.93155879" "50730.9149927304" "0" "0.02793205" "26794319100.1314"
[71] "1105416320667.99" "184084531389.181" "72.706" "40.4175" "1065349214847.34"
[76] "2021-12-23T19:21:02.000Z" "18897342.6115399" "18897342.6115399" "BTC" "0"
[81] "0" "0" "0" "0" "0"
[86] "1" "0" "0.02793205" "527841.47774037" "21776428.8780472"
[91] "3626419.86588612" "72.706" "40.4175" "1065349214847.34" "2021-12-23T19:21:02.000Z"
[96] "232885004.198773" "232885004.198773" "ETH" "-0.189131" "0.653349"
[101] "-11.42415087" "-16.02722155" "3.129837" "19.93155879" "12.31613021"
[106] "0" "0.02793205" "6504955.07684694" "268365972.663341" "44690876.5456617"
[111] "72.706" "40.4175" "1065349214847.34" "2021-12-23T19:20:02.000Z" "959267979935.385"
[116] "959267979935.385" "USD" "0.53649283" "3.98091259" "-11.42415087"
[121] "-16.02722155" "5.84148872" "19.93155879" "50730.9149927304" "0"
[126] "0.02793205" "26794319100.1314" "1105416320667.99" "184084531389.181" "72.706"
[131] "1" "0" "bitcoin" "BTC" "18908943"
[136] NA NA
[[3]]
[1] "4891.70469755141" "0.420897006988525" "118860687.6865" "2" "2015-08-07T00:00:00.000Z"
[6] "TRUE" "FALSE" "4119.08504574469" "1027" "1"
[11] "FALSE" "2021-12-23T19:20:02.000Z" "3897.23447281111" "4509" NA
[16] "Ethereum" "20.6197" "489234090606.33" "2021-12-23T19:21:02.000Z" "9637790.92058901"
[21] "9637790.92058901" "BTC" "0.277187" "-0.842643" "-4.49917037"
What I eventually want is something like what I can already retrieve for the historical data.
json_data <- read_html("https://coinmarketcap.com/historical/20150621/") %>%
html_node("#__NEXT_DATA__") %>%
html_text() %>%
fromJSON()
df_data <- json_data$props$initialState$cryptocurrency$listingHistorical$data
> head ( df_data )
id name symbol slug num_market_pairs date_added tags max_supply circulating_supply total_supply platform.id
1 1 Bitcoin BTC bitcoin NA 2013-04-28T00:00:00.000Z mineable 21000000 14298800 14298800 NA
2 52 XRP XRP xrp NA 2013-08-04T00:00:00.000Z 100000000000 31908551587 99998976018 NA
3 2 Litecoin LTC litecoin NA 2013-04-28T00:00:00.000Z mineable 84000000 40119404 40119404 NA
4 74 Dogecoin DOGE dogecoin NA 2013-12-15T00:00:00.000Z mineable NA 99890370337 99890370337 NA
5 463 BitShares BTS bitshares NA 2014-07-21T00:00:00.000Z 3600570502 2511953117 2511953117 NA
6 512 Stellar XLM stellar NA 2014-08-05T00:00:00.000Z NA 4837354256 100804167862 NA
Use html_table:
library(jsonlite)
library ( tidyverse)
library( rvest )
# lets get what is marketcap today.
# Fetch the front page and parse every <table> element into a data frame.
# html_table() returns a list with one element per table found.
json_data <- read_html("https://coinmarketcap.com/") %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)
It returns a table.
> json_data
[[1]]
# A tibble: 100 x 11
`` `#` Name Price `24h %` `7d %` `Market Cap` `Volume(24h)`
<lgl> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 NA 1 Bitcoin1~ $50,77~ 3.61% 5.53% $960.18B$960,~ $28,207,384,9685~
2 NA 2 Ethereum~ $4,104~ 2.18% 1.88% $487.89B$487,~ $17,920,397,7984~
3 NA 3 Binance ~ $548.65 1.94% 2.67% $91.52B$91,51~ $1,860,150,3053,~
4 NA 4 Tether4U~ $1.00 0.04% 0.01% $77.38B$77,38~ $68,556,169,0906~
5 NA 5 Solana5S~ $189.82 4.75% 3.83% $58.55B$58,55~ $2,144,421,38811~
6 NA 6 Cardano6~ $1.47 8.69% 15.79% $49.08B$49,07~ $1,964,583,1431,~
7 NA 7 XRP7XRP $1.01 4.26% 23.58% $47.82B$47,81~ $4,211,885,8344,~
8 NA 8 USD Coin~ $1.00 0.05% 0.05% $42.57B$42,57~ $4,039,920,4424,~
9 NA 9 Terra9LU~ $92.66 3.16% 37.30% $34.02B$34,02~ $4,141,070,96044~
10 NA 10 Avalanch~ $122.16 1.27% 17.38% $29.71B$29,70~ $1,291,116,76510~
# ... with 90 more rows, and 3 more variables: Circulating Supply <chr>,
# Last 7 Days <lgl>, <lgl>

write.csv returns a blank file in R

I have a data set that is in a .Rdata format - something I haven't worked with before. I would like to export the data to a csv or related file for use in Python. I've used "write.csv", "write.table", and a few others and while they all seem like they are writing to the file, when I open it it's completely blank. I've also tried converting the data to a dataframe before exporting with no luck so far.
After importing the file in R, the data is labeled as a Large array (1499904 elements, 11.5 Mb) with the following attributes:
> attributes(data.station)
$`dim`
[1] 12 31 288 7 2
$dimnames
$dimnames[[1]]
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
$dimnames[[2]]
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21"
[22] "22" "23" "24" "25" "26" "27" "28" "29" "30" "31"
$dimnames[[3]]
[1] "" "00:05:00" "00:10:00" "00:15:00" "00:20:00" "00:25:00" "00:30:00" "00:35:00" "00:40:00"
[10] "00:45:00" "00:50:00" "00:55:00" "01:00:00" "01:05:00" "01:10:00" "01:15:00" "01:20:00" "01:25:00"
[19] "01:30:00" "01:35:00" "01:40:00" "01:45:00" "01:50:00" "01:55:00" "02:00:00" "02:05:00" "02:10:00"
[28] "02:15:00" "02:20:00" "02:25:00" "02:30:00" "02:35:00" "02:40:00" "02:45:00" "02:50:00" "02:55:00"
[37] "03:00:00" "03:05:00" "03:10:00" "03:15:00" "03:20:00" "03:25:00" "03:30:00" "03:35:00" "03:40:00"
[46] "03:45:00" "03:50:00" "03:55:00" "04:00:00" "04:05:00" "04:10:00" "04:15:00" "04:20:00" "04:25:00"
[55] "04:30:00" "04:35:00" "04:40:00" "04:45:00" "04:50:00" "04:55:00" "05:00:00" "05:05:00" "05:10:00"
[64] "05:15:00" "05:20:00" "05:25:00" "05:30:00" "05:35:00" "05:40:00" "05:45:00" "05:50:00" "05:55:00"
[73] "06:00:00" "06:05:00" "06:10:00" "06:15:00" "06:20:00" "06:25:00" "06:30:00" "06:35:00" "06:40:00"
[82] "06:45:00" "06:50:00" "06:55:00" "07:00:00" "07:05:00" "07:10:00" "07:15:00" "07:20:00" "07:25:00"
[91] "07:30:00" "07:35:00" "07:40:00" "07:45:00" "07:50:00" "07:55:00" "08:00:00" "08:05:00" "08:10:00"
[100] "08:15:00" "08:20:00" "08:25:00" "08:30:00" "08:35:00" "08:40:00" "08:45:00" "08:50:00" "08:55:00"
[109] "09:00:00" "09:05:00" "09:10:00" "09:15:00" "09:20:00" "09:25:00" "09:30:00" "09:35:00" "09:40:00"
[118] "09:45:00" "09:50:00" "09:55:00" "10:00:00" "10:05:00" "10:10:00" "10:15:00" "10:20:00" "10:25:00"
[127] "10:30:00" "10:35:00" "10:40:00" "10:45:00" "10:50:00" "10:55:00" "11:00:00" "11:05:00" "11:10:00"
[136] "11:15:00" "11:20:00" "11:25:00" "11:30:00" "11:35:00" "11:40:00" "11:45:00" "11:50:00" "11:55:00"
[145] "12:00:00" "12:05:00" "12:10:00" "12:15:00" "12:20:00" "12:25:00" "12:30:00" "12:35:00" "12:40:00"
[154] "12:45:00" "12:50:00" "12:55:00" "13:00:00" "13:05:00" "13:10:00" "13:15:00" "13:20:00" "13:25:00"
[163] "13:30:00" "13:35:00" "13:40:00" "13:45:00" "13:50:00" "13:55:00" "14:00:00" "14:05:00" "14:10:00"
[172] "14:15:00" "14:20:00" "14:25:00" "14:30:00" "14:35:00" "14:40:00" "14:45:00" "14:50:00" "14:55:00"
[181] "15:00:00" "15:05:00" "15:10:00" "15:15:00" "15:20:00" "15:25:00" "15:30:00" "15:35:00" "15:40:00"
[190] "15:45:00" "15:50:00" "15:55:00" "16:00:00" "16:05:00" "16:10:00" "16:15:00" "16:20:00" "16:25:00"
[199] "16:30:00" "16:35:00" "16:40:00" "16:45:00" "16:50:00" "16:55:00" "17:00:00" "17:05:00" "17:10:00"
[208] "17:15:00" "17:20:00" "17:25:00" "17:30:00" "17:35:00" "17:40:00" "17:45:00" "17:50:00" "17:55:00"
[217] "18:00:00" "18:05:00" "18:10:00" "18:15:00" "18:20:00" "18:25:00" "18:30:00" "18:35:00" "18:40:00"
[226] "18:45:00" "18:50:00" "18:55:00" "19:00:00" "19:05:00" "19:10:00" "19:15:00" "19:20:00" "19:25:00"
[235] "19:30:00" "19:35:00" "19:40:00" "19:45:00" "19:50:00" "19:55:00" "20:00:00" "20:05:00" "20:10:00"
[244] "20:15:00" "20:20:00" "20:25:00" "20:30:00" "20:35:00" "20:40:00" "20:45:00" "20:50:00" "20:55:00"
[253] "21:00:00" "21:05:00" "21:10:00" "21:15:00" "21:20:00" "21:25:00" "21:30:00" "21:35:00" "21:40:00"
[262] "21:45:00" "21:50:00" "21:55:00" "22:00:00" "22:05:00" "22:10:00" "22:15:00" "22:20:00" "22:25:00"
[271] "22:30:00" "22:35:00" "22:40:00" "22:45:00" "22:50:00" "22:55:00" "23:00:00" "23:05:00" "23:10:00"
[280] "23:15:00" "23:20:00" "23:25:00" "23:30:00" "23:35:00" "23:40:00" "23:45:00" "23:50:00" "23:55:00"
$dimnames[[4]]
[1] "tempinf" "tempf" "humidityin" "humidity" "solarradiation" "hourlyrainin"
[7] "windspeedmph"
$dimnames[[5]]
[1] "2020" "2021"
Any advice on how to handle this? Thank you!
You have to flatten the array to write it. First we create a reproducible example of your data:
# Reproducible stand-in for the question's data: a 2 x 3 x 4 x 5 x 6 integer
# array whose dimensions are labelled with consecutive capital letters.
dnames <- list(
  LETTERS[1:2],
  LETTERS[3:5],
  LETTERS[6:9],
  LETTERS[10:14],
  LETTERS[15:20]
)
x <- seq_len(2 * 3 * 4 * 5 * 6)
y <- array(data = x, dim = c(2, 3, 4, 5, 6), dimnames = dnames)
str(y)
# int [1:2, 1:3, 1:4, 1:5, 1:6] 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, "dimnames")=List of 5
# ..$ : chr [1:2] "A" "B"
# ..$ : chr [1:3] "C" "D" "E"
# ..$ : chr [1:4] "F" "G" "H" "I"
# ..$ : chr [1:5] "J" "K" "L" "M" ...
# ..$ : chr [1:6] "O" "P" "Q" "R" ...
attributes(y)
# $dim
# [1] 2 3 4 5 6
#
# $dimnames
# $dimnames[[1]]
# [1] "A" "B"
#
# $dimnames[[2]]
# [1] "C" "D" "E"
#
# $dimnames[[3]]
# [1] "F" "G" "H" "I"
#
# $dimnames[[4]]
# [1] "J" "K" "L" "M" "N"
#
# $dimnames[[5]]
# [1] "O" "P" "Q" "R" "S" "T"
Now we flatten the array and write it to a file:
z <- as.data.frame.table(y)
str(z)
# 'data.frame': 720 obs. of 6 variables:
# $ Var1: Factor w/ 2 levels "A","B": 1 2 1 2 1 2 1 2 1 2 ...
# $ Var2: Factor w/ 3 levels "C","D","E": 1 1 2 2 3 3 1 1 2 2 ...
# $ Var3: Factor w/ 4 levels "F","G","H","I": 1 1 1 1 1 1 2 2 2 2 ...
# $ Var4: Factor w/ 5 levels "J","K","L","M",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Var5: Factor w/ 6 levels "O","P","Q","R",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Freq: int 1 2 3 4 5 6 7 8 9 10 ...
write.csv(z, file="dfz.csv", row.names=FALSE)
Finally we read the file and convert it back to an array:
a <- read.csv("dfz.csv", as.is=FALSE)
b <- xtabs(Freq~., a)
class(b) <- "array"
attr(b, "call") <- NULL
names(dimnames(b)) <- NULL
str(b)
# int [1:2, 1:3, 1:4, 1:5, 1:6] 1 2 3 4 5 6 7 8 9 10 ...
# - attr(*, "dimnames")=List of 5
# ..$ : chr [1:2] "A" "B"
# ..$ : chr [1:3] "C" "D" "E"
# ..$ : chr [1:4] "F" "G" "H" "I"
# ..$ : chr [1:5] "J" "K" "L" "M" ...
# ..$ : chr [1:6] "O" "P" "Q" "R" ...

Extracting nth value from row vector in R

I have been searching/thinking of a way in which I can extract the nth value (e.g. 2nd, 5th, 7th, etc.) from each row in my data frame.
For example, I have the following columns:
ID Q1-2013 Q2-2013 Q3-2013 Q4-2013 Q1-2014 Q2-2014 Q3-2014 Q4-2014
Under each column there are given values. What I would like to do is pull the nth value of each row from the quarters vector (2nd-8th columns). So for example, if I am looking for the 2nd value from each row, the formula/function I want would extract/pull the 2nd value from each row from columns 2-8 (Q1-2013 to Q4-2014). In addition, the formula/function would ignore the blanks/NA values in each row as well.
Maybe this is what you're after.
I first modified the iris data set with some NAs in each column:
# In every column, overwrite 30 of the 150 rows (drawn without replacement)
# with NA. `iris[] <-` keeps the data frame structure while replacing columns.
iris[] <- lapply(iris, function(x) {
  x[sample(150, 30, replace = FALSE)] <- NA
  x
})
head(iris)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1 5.1 3.5 1.4 NA setosa
#2 NA NA 1.4 NA setosa
#3 NA NA 1.3 0.2 setosa
#4 4.6 3.1 1.5 NA setosa
#5 5.0 3.6 1.4 0.2 setosa
#6 5.4 NA 1.7 0.4 setosa
Then, to extract the second non-empty and non-NA entries per row you could use apply (I know, it's not recommended on data frames, but it does the dirty job):
apply(iris, 1, function(x) x[which(!is.na(x) & x != "")[2]])
# [1] "3.5" "setosa" "0.2" "3.1" "3.6" "1.7" "3.4" "3.4" "2.9" "3.1" "setosa"
#[12] "3.4" "1.4" "1.1" "1.2" "4.4" "3.9" "3.5" "3.8" "3.8" "0.2" "3.7"
#[23] "3.6" "1.7" "1.9" "3.0" "3.4" "1.5" "3.4" "3.2" "3.1" "3.4" "4.1"
#[34] "4.2" "3.1" "3.2" "3.5" "3.6" "setosa" "1.5" "1.3" "2.3" "1.3" "0.6"
#[45] "0.4" "3.0" "3.8" "3.2" "3.7" "3.3" "3.2" "3.2" "1.5" "2.3" "2.8"
#[56] "2.8" "3.3" "2.4" "4.6" "1.4" "2.0" "3.0" "1.0" "2.9" "2.9" "3.1"
#[67] "3.0" "2.7" "4.5" "3.9" "3.2" "4.0" "2.5" "4.7" "4.3" "3.0" "2.8"
#[78] "5.0" "2.9" "3.5" "3.8" "2.4" "2.7" "2.7" "3.0" "3.4" "3.1" "1.3"
#[89] "4.1" "1.3" "2.6" "3.0" "2.6" "2.3" "4.2" "3.0" "2.9" "2.9" "2.5"
#[100] "2.8" "3.3" "2.7" "3.0" "2.9" "3.0" "3.0" "4.5" "2.9" "5.8" "3.6"
#[111] "3.2" "1.9" "5.5" "2.0" "5.1" "3.2" "5.5" "3.8" "virginica" "1.5" "3.2"
#[122] "2.8" "2.8" "2.7" "2.1" "6.0" "2.8" "3.0" "2.8" "5.8" "2.8" "3.8"
#[133] "5.6" "1.5" "2.6" "3.0" "5.6" "5.5" "4.8" "3.1" "5.6" "5.1" "2.7"
#[144] "3.2" "3.3" "3.0" "2.5" "5.2" "5.4" "3.0"
Because apply will first convert the data frame to a matrix, all columns are converted to the same type, which is character in this case. You can later convert the result to whatever you want (but note that you can't convert the output vector directly back to numeric in this case, since it contains some character strings such as "setosa").
You could also use a convenient function naLast from library(SOfun)
library(SOfun)
dat[dat==''] <- NA #convert all `blank` cells to `NA`
n <- 2 # the row/column index that needs to be extracted
naLast(dat, by='col')[n,] #get the 2nd non-empty/nonNA element for each columns
#V1 V2 V3 V4 V5
#"G" "B" "B" "B" "C"
which would be the same with apply
apply(dat, 2, function(x) x[which(!is.na(x) & x!='')[2]])
#V1 V2 V3 V4 V5
#"G" "B" "B" "B" "C"
You could also specify by='row'
naLast(dat, by='row')[,n] #get the 2nd non-empty/nonNA element for each row
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#"G" "D" "B" "G" "E" "B" "J" "F" "F" "A" "H" "C" "A" "D" "H" "D" "J" "C" "A" "A"
data
set.seed(25)
dat <- as.data.frame(matrix(sample(c(NA,'',LETTERS[1:10]),
20*5, replace=TRUE), ncol=5), stringsAsFactors=FALSE)
You can install the package by
library(devtools)
install_github("mrdwab/SOfun")

Access the levels of a factor in R

I have a 5-level factor that looks like the following:
tmp
[1] NA
[2] 1,2,3,6,11,12,13,18,20,21,22,26,29,33,40,43,46
[3] NA
[4] NA
[5] 5,9,16,24,35,36,42
[6] 4,7,10,14,15,17,19,23,25,27,28,30,31,32,34,37,38,41,44,45,47,48,49,50
[7] 8,39
5 Levels: 1,2,3,6,11,12,13,18,20,21,22,26,29,33,40,43,46 ...
I want to access the items within each level except NA. So I use the levels() function, which gives me:
> levels(tmp)
[1] "1,2,3,6,11,12,13,18,20,21,22,26,29,33,40,43,46"
[2] "4,7,10,14,15,17,19,23,25,27,28,30,31,32,34,37,38,41,44,45,47,48,49,50"
[3] "5,9,16,24,35,36,42"
[4] "8,39"
[5] "NA"
Then I would like to access the elements in each level, and store them as numbers. However, for example,
>as.numeric(cat(levels(tmp)[3]))
5,9,16,24,35,36,42numeric(0)
Can you help me remove the commas within the numbers and the numeric(0) at the very end? I would like to have a numeric vector 5, 9, 16, 24, 35, 36, 42 so that I can use its values as indices to access a data frame. Thanks!
You need to use a combination of unlist, strsplit and unique.
First, recreate your data:
# Recreate the question's factor: one comma-separated string per line, with
# the literal NA lines read as missing values. `stringsAsFactors = TRUE` is
# required on R >= 4.0.0, where read.table() otherwise returns a character
# column and levels(dat) would be NULL.
dat <- read.table(text="
NA
1,2,3,6,11,12,13,18,20,21,22,26,29,33,40,43,46
NA
NA
5,9,16,24,35,36,42
4,7,10,14,15,17,19,23,25,27,28,30,31,32,34,37,38,41,44,45,47,48,49,50
8,39", stringsAsFactors = TRUE)$V1
Next, find all the unique levels, after using strsplit:
sort(unique(unlist(
sapply(levels(dat), function(x)unlist(strsplit(x, split=",")))
)))
[1] "1" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "2" "20" "21" "22" "23" "24" "25" "26"
[20] "27" "28" "29" "3" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "4" "40" "41" "42" "43"
[39] "44" "45" "46" "47" "48" "49" "5" "50" "6" "7" "8" "9"
Does this do what you want?
levels_split <- strsplit(levels(tmp), ",")
lapply(levels_split, as.numeric)
Using Andrie's dat
# Parse all comma-separated level strings into one numeric vector.
val <- scan(text = levels(dat), sep = ",")
#Read 50 items
# A drop in value (negative diff) starts a new group; the running count of
# such drops is the group id. This relies on the numbers within each level
# being in ascending order, as they are in this data.
split(val, cumsum(c(TRUE, diff(val) < 0)))
#$`1`
#[1] 1 2 3 6 11 12 13 18 20 21 22 26 29 33 40 43 46
#$`2`
#[1] 4 7 10 14 15 17 19 23 25 27 28 30 31 32 34 37 38 41 44 45 47 48 49 50
#$`3`
#[1] 5 9 16 24 35 36 42
#$`4`
#[1] 8 39

reformatting data frame with List in R

Hello, I am trying to reshape a data.frame in R such that each row is repeated once for every value in the corresponding entry of a list: the first row is repeated for each value in the first list entry, the next row for each value in the second entry, and so on.
The list is called wrk, dfx is the data frame I want to reshape, and listOut is what I want to end up with.
Thank you very much for your help.
> wrk
[[1]]
[1] "41" "42" "44" "45" "97" "99" "100" "101" "102"
[10] "103" "105" "123" "124" "126" "127" "130" "132" "135"
[19] "136" "137" "138" "139" "140" "141" "158" "159" "160"
[28] "161" "162" "163" "221" "223" "224" ""
[[2]]
[1] "41" "42" "44" "45" "98" "99" "100" "101" "102"
[10] "103" "105" "123" "124" "126" "127" "130" "132" "135"
[19] "136" "137" "138" "139" "140" "141" "158" "159" "160"
[28] "161" "162" "163" "221" "223" "224" ""
>dfx
projectScore highestRankingGroup
1 0.8852 1
2 0.8845 2
>listOut
projectScore highestRankingGroup wrk
1 0.8852 1 41
2 0.8852 1 42
3 0.8852 1 44
4 0.8852 1 45
5 0.8852 1 97
6 0.8852 1 99
7 0.8852 1 100
8 0.8852 1 101
...
35 0.8845 2 41
36 0.8845 2 42
37 0.8845 2 44
38 0.8845 2 45
39 0.8845 2 98
40 0.8845 2 99
41 0.8845 2 100
How about replicate rows of dfx and cbind with unlisted wrk:
# Repeat row i of dfx once per element of wrk[[i]], then attach the
# flattened values as a new column. lengths() always returns an integer
# vector, whereas sapply(wrk, length) returns a list when wrk is empty.
listOut <- cbind(
  dfx[rep(seq_along(wrk), lengths(wrk)), ],
  wrk = unlist(wrk)
)
How about:
If wrk contains simple vectors like in your example:
> szs<-sapply(wrk, length)
> fulldfr<-do.call(c, wrk)
> listOut<-cbind(dfx[rep(seq_along(szs), szs),], fulldfr)
If wrk contains dataframes:
> szs<-sapply(wrk, function(dfr){dim(dfr)[1]})
> fulldfr<-do.call(rbind, wrk)
> listOut<-cbind(dfx[rep(seq_along(szs), szs),], fulldfr)
How about:
expand.grid(dfx$projectScore, dfx$highestRankingGroup, wrk[[1]])
Edit:
Maybe you can elaborate a bit more, because this does seem to work:
a <- c("41","42","44","45","97","99","100","101","102","103","105", "123","124","126","127","130","132","135","136","137","138","139","140","141","158","159","160","161","162","163","221","223","224")
wrk <-list(a, a)
dfx <- data.frame(projectScore=c(0.8852, 0.8845), highestRankingGroup=c(1,2))
listOut <- expand.grid(dfx$projectScore, dfx$highestRankingGroup, wrk[[1]])
names(listOut) <- c("projectScore", "highestRankingGroup", "wrk")
listOut[order(-listOut$projectScore,listOut$highestRankingGroup, listOut$wrk),]

Resources