R tidyr::spread duplicate error - r

I have the following data:
ID AGE SEX RACE COUNTRY VISITNUM VSDTC VSTESTCD VSORRES
32320058 58 M WHITE UKRAINE 2 2016-04-28 DIABP 74
32320058 58 M WHITE UKRAINE 1 2016-04-21 HEIGHT 183
32320058 58 M WHITE UKRAINE 1 2016-04-21 SYSBP 116
32320058 58 M WHITE UKRAINE 2 2016-04-28 SYSBP 116
32320058 58 M WHITE UKRAINE 1 2016-04-21 WEIGHT 109
22080090 75 M WHITE MEXICO 1 2016-05-17 DIABP 81
22080090 75 M WHITE MEXICO 1 2016-05-17 HEIGHT 176
22080090 75 M WHITE MEXICO 1 2016-05-17 SYSBP 151
I would like to reshape the data using tidyr::spread to get the following output:
ID AGE SEX RACE COUNTRY VISITNUM VSDTC DIABP SYSBP WEIGHT HEIGHT
32320058 58 M WHITE UKRAINE 2 2016-04-28 74 116 NA NA
32320058 58 M WHITE UKRAINE 1 2016-04-21 NA 116 109 183
22080090 75 M WHITE MEXICO 1 2016-05-17 81 151 NA 176
I receive duplicate errors, although I don't have duplicates in my data!
df1=spread(df,VSTESTCD,VSORRES)
Error: Duplicate identifiers for rows (36282, 36283), (59176, 59177), (59179, 59180)

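Assuming I understand your question correctly: spread() reports "Duplicate identifiers" when two or more rows have the same key (here VSTESTCD) and identical values in all the remaining columns, so it cannot tell which output row each value belongs to.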
# Rows that agree on every column except the value are indistinguishable to
# spread(), so we add a unique identifier column before reshaping.
# The built-in iris dataset is used as an example; only the tidyverse is needed.
library(tidyverse)
# check the dataset (iris)
head(iris)
# Add a unique row identifier, then gather every column except Species
# (wide data -> long data)
iris_gather <- iris %>%
  dplyr::mutate(ID = dplyr::row_number()) %>%
  tidyr::gather(key = Type, value = my_value, 1:4)
# check first six rows
head(iris_gather)
# spread() now succeeds because (Species, ID) identifies each output row;
# the helper ID column is dropped afterwards
iris_spread <- iris_gather %>%
  tidyr::spread(key = Type, value = my_value) %>%
  dplyr::select(-ID)
# check first six rows of iris_spread
head(iris_spread)

Related

R stops scraping when there is missing data

I am using this code to loop through multiple URLs to scrape data. The code works fine until it comes to a date that has missing data. This is the error message that pops up:
Error in data.frame(away, home, away1H, home1H, awayPinnacle, homePinnacle) :
arguments imply differing number of rows: 7, 8
I am very new to coding and could not figure out how to make it keep scraping despite the missing data.
library(rvest)
library(dplyr)
# Scrape one day's NBA first-half money-line page from sportsbookreview.
#
# date: value appended to the URL's `date` query parameter.
# Returns (invisibly) a data frame with the columns
#   away, home, away1H, home1H, awayPinnacle, homePinnacle.
# data.frame() raises an error when the scraped vectors differ in length.
get_data <- function(date) {
  oddspage <- read_html(paste0(
    'https://classic.sportsbookreview.com/betting-odds/nba-basketball/money-line/1st-half/?date=',
    date
  ))

  # Text of every node matched by a CSS selector, and its numeric conversion
  text_at <- function(selector) html_text(html_nodes(oddspage, selector))
  number_at <- function(selector) as.numeric(text_at(selector))

  # Away and home team names
  away <- text_at('.eventLine-value:nth-child(1) a')
  home <- text_at('.eventLine-value+ .eventLine-value a')

  # First-half score = 1Q score + 2Q score
  away1H <- number_at('.current-score+ .first') +
    number_at('.first:nth-child(3)')
  home1H <- number_at('.score-periods+ .score-periods .current-score+ .period') +
    number_at('.score-periods+ .score-periods .period:nth-child(3)')

  # Final scores are scraped but not included in the returned data frame
  awayScore <- number_at('.first.total')
  homeScore <- number_at('.score-periods+ .score-periods .total')

  # Pinnacle away and home odds
  awayPinnacle <- number_at('.eventLine-consensus+ .eventLine-book .eventLine-book-value:nth-child(1) b')
  homePinnacle <- number_at('.eventLine-consensus+ .eventLine-book .eventLine-book-value+ .eventLine-book-value b')

  invisible(data.frame(away, home, away1H, home1H, awayPinnacle, homePinnacle))
}
# Date strings 20190202 ... 20190206, passed one at a time to get_data()
date_vec <- sprintf('201902%02d', 2:6)
# Scrape every date and stack the per-date data frames row-wise
all_data <- do.call(
  rbind,
  lapply(date_vec, get_data)
)
View(all_data)
I'd recommend purrr::map_dfr() instead of do.call(rbind, lapply(...)). Then you can wrap get_data() in possibly(), which is a nice way to catch errors and keep going: a date that fails returns an empty data frame instead of stopping the whole loop.
library(purrr)
map_dfr(date_vec, possibly(get_data, otherwise = data.frame()))
Output:
away home away1H home1H awayPinnacle homePinnacle
1 L.A. Clippers Detroit 47 65 116 -131
2 Milwaukee Washington 73 50 -181 159
3 Chicago Charlotte 60 51 192 -220
4 Brooklyn Orlando 48 44 121 -137
5 Indiana Miami 53 54 117 -133
6 Dallas Cleveland 58 55 -159 140
7 L.A. Lakers Golden State 58 63 513 -651
8 New Orleans San Antonio 50 63 298 -352
9 Denver Minnesota 61 64 107 -121
10 Houston Utah 63 50 186 -213
11 Atlanta Phoenix 58 57 110 -125
12 Philadelphia Sacramento 52 62 -139 123
13 Memphis New York 42 41 -129 114
14 Oklahoma City Boston 58 66 137 -156
15 L.A. Clippers Toronto 51 65 228 -263
16 Atlanta Washington 61 57 172 -196
17 Denver Detroit 55 68 -112 -101
18 Milwaukee Brooklyn 51 42 -211 184
19 Indiana New Orleans 53 50 -143 127
20 Houston Phoenix 63 57 -256 222
21 San Antonio Sacramento 59 63 -124 110

Convert row names into new columns in a data frame

Apologies in advance if this has already been asked elsewhere, but I've tried different attempts and nothing has worked so far.
In my data frame Mesure I would like to split the values of the column Row.names into two new columns named Sample_type and Locality. I tried a tidyverse solution, but R returns an error saying that the column name must not be duplicated... How can I modify it? Also, is it possible to remove the "<"?
> head(Mesure)
Row.names mean_Mesure max_Mesure min_Mesure
1 Aquatic_moss.Paris.AG-110m.< 100 110 90
2 Aquatic_moss.Paris.BE-7. 123 177 53
3 Aquatic_moss.Paris.CO-57.< 40 60 20
4 Aquatic_moss.Paris.CO-58.< 40 50 30
5 Aquatic_moss.Paris.CO-60.< 50 70 30
6 Aquatic_moss.Paris.CS-134.< 200 300 100
>
> library(tidyverse)
> new_df <- Mesure %>%
+ rownames_to_column(var = "Row.names") %>%
+ separate(Row.names,sep = ".",into = c("Sample_type","Locality"))
Error: Column name `Row.names` must not be duplicated.
Run `rlang::last_error()` to see where the error occurred.
To separate that with the first "dot" you can use:
Mesure %>%
separate(Row.names, sep = "\\.", into = c("Sample_type", "Locality"), extra = "merge")
Explanation:
You don't need to convert rownames_to_column(), because "Row.names" is already a column.
sep = "." is not enough because sep is interpreted as a regular expression, in which . matches any character; escape it as "\\." to match a literal dot.
There are many . in the column, so you need to specify extra = "merge" to separate only at first appearance. If you would like to keep only "Paris" without AG-110m etc, you specify extra = "drop" there.
Result with extra = "merge":
Sample_type Locality mean_Mesure max_Mesure min_Mesure
1 Aquatic_moss Paris.AG-110m.< 100 110 90
2 Aquatic_moss Paris.BE-7. 123 177 53
3 Aquatic_moss Paris.CO-57.< 40 60 20
4 Aquatic_moss Paris.CO-58.< 40 50 30
5 Aquatic_moss Paris.CO-60.< 50 70 30
6 Aquatic_moss Paris.CS-134.< 200 300 100
Result with extra = "drop":
Sample_type Locality mean_Mesure max_Mesure min_Mesure
1 Aquatic_moss Paris 100 110 90
2 Aquatic_moss Paris 123 177 53
3 Aquatic_moss Paris 40 60 20
4 Aquatic_moss Paris 40 50 30
5 Aquatic_moss Paris 50 70 30
6 Aquatic_moss Paris 200 300 100
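If you need to drop the "<" at the end of the Locality column, first assign the separated result back to Mesure (e.g. Mesure <- Mesure %>% separate(...)), then run something like: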
Mesure$Locality <- gsub("<$", "", Mesure$Locality)
where "<$" means "< at the end of the string".
Apologies, I should have read your question properly. The answer to the second part of your question would be:
d %>% separate(Row.names, into=c("Sample_type","Locality"), extra="drop")
# A tibble: 6 x 6
Sample_type Locality mean_Mesure max_Mesure min_Mesure
<chr> <chr> <dbl> <dbl> <dbl>
1 Aquatic moss 100 110 90
2 Aquatic moss 123 177 53
3 Aquatic moss 40 60 20
4 Aquatic moss 40 50 30
5 Aquatic moss 50 70 30
6 Aquatic moss 200 300 100
I can't help you with the first part because I don't know how you create the input data frame.

Using str_split to fill rows down data frame with number ranges and multiple numbers

I have a dataframe with crop names and their respective FAO codes. Unfortunately, some crop categories, such as 'other cereals', have multiple FAO codes, ranges of FAO codes or even worse - multiple ranges of FAO codes.
Snippet of the dataframe with the different formats for FAO codes.
> FAOCODE_crops
SPAM_full_name FAOCODE
1 wheat 15
2 rice 27
8 other cereals 68,71,75,89,92,94,97,101,103,108
27 other oil crops 260:310,312:339
31 other fibre crops 773:821
Using the following code successfully breaks down these numbers,
unlist(lapply(unlist(strsplit(FAOCODE_crops$FAOCODE, ",")), function(x) eval(parse(text = x))))
[1] 15 27 56 44 79 79 83 68 71 75 89 92 94 97 101 103 108
... but I fail to merge these numbers back into the dataframe, where every FAOCODE gets its own row.
> FAOCODE_crops$FAOCODE <- unlist(lapply(unlist(strsplit(MAPSPAM_crops$FAOCODE, ",")), function(x) eval(parse(text = x))))
Error in `$<-.data.frame`(`*tmp*`, FAOCODE, value = c(15, 27, 56, 44, :
replacement has 571 rows, data has 42
I fully understand why it doesn't merge successfully, but I can't figure out a way to fill the table with a new row for each FAOCODE as idealized below:
SPAM_full_name FAOCODE
1 wheat 15
2 rice 27
8 other cereals 68
8 other cereals 71
8 other cereals 75
8 other cereals 89
And so on...
Any help is greatly appreciated!
We can use separate_rows to split FAOCODE at each comma. After that, we can loop through FAOCODE using map and ~eval(parse(text = .x)) to evaluate each number range. Finally, we can use unnest to expand the data frame.
library(tidyverse)
# Expand each crop's FAOCODE string into one row per individual code.
dat2 <- dat %>%
# one row per comma-separated token, e.g. "68" or "260:310"
separate_rows(FAOCODE, sep = ",") %>%
# NOTE(review): eval(parse()) runs each token as R code, which is what turns
# "260:310" into a sequence; only safe if FAOCODE comes from a trusted source
mutate(FAOCODE = map(FAOCODE, ~eval(parse(text = .x)))) %>%
# flatten the list-column so every code gets its own row
unnest(cols = FAOCODE)
dat2
# # A tibble: 140 x 2
# SPAM_full_name FAOCODE
# <chr> <dbl>
# 1 wheat 15
# 2 rice 27
# 3 other cereals 68
# 4 other cereals 71
# 5 other cereals 75
# 6 other cereals 89
# 7 other cereals 92
# 8 other cereals 94
# 9 other cereals 97
# 10 other cereals 101
# # ... with 130 more rows
DATA
dat <- read.table(text = " SPAM_full_name FAOCODE
1 wheat 15
2 rice 27
8 'other cereals' '68,71,75,89,92,94,97,101,103,108'
27 'other oil crops' '260:310,312:339'
31 'other fibre crops' '773:821'",
header = TRUE, stringsAsFactors = FALSE)

How to create a Markdown table with different column lengths based on a dataframe in long format in R?

I'm working on a R Markdown file that I would like to submit as a manuscript to an academic journal. I would like to create a table that shows which three words (item2) co-occur most frequently with some keywords (item1). Note that some key words have more than three co-occurring words. The data that I am currently working with:
item1 <- c("water","water","water","water","water","sun","sun","sun","sun","moon","moon","moon")
item2 <- c("tree","dog","cat","fish","eagle","bird","table","bed","flower","house","desk","tiger")
n <- c("200","83","34","34","34","300","250","77","77","122","46","46")
df <- data.frame(item1,item2,n)
Which gives this dataframe:
item1 item2 n
1 water tree 200
2 water dog 83
3 water cat 34
4 water fish 34
5 water eagle 34
6 sun bird 300
7 sun table 250
8 sun bed 77
9 sun flower 77
10 moon house 122
11 moon desk 46
12 moon tiger 46
Ultimately, I would like to pass the data to the function papaja::apa_table, which requires a data.frame (or a matrix / list). I therefore need to reshape the data.
My question:
How can I reshape the data (preferably with dplyr) to get the following structure?
water_item2 water_n sun_item2 sun_n moon_item2 moon_n
1 tree 200 bird 300 house 122
2 dog 83 table 250 desk 46
3 cat 34 bed 77 tiger 46
4 fish 34 flower 77 <NA> <NA>
5 eagle 34 <NA> <NA> <NA> <NA>
We can borrow an approach from an old answer of mine to a different question, and modify a classic gather(), unite(), spread() strategy by creating unique identifiers by group to avoid duplicate identifiers, then dropping that variable:
library(dplyr)
library(tidyr)
item1 <- c("water","water","water","water","water","sun","sun","sun","sun","moon","moon","moon")
item2 <- c("tree","dog","cat","fish","eagle","bird","table","bed","flower","house","desk","tiger")
n <- c("200","83","34","34","34","300","250","77","77","122","46","46")
# Owing to Richard Telford's excellent comment, I build a tibble (or,
# equivalently for our purposes, data.frame(..., stringsAsFactors = FALSE))
# to avoid turning the strings into factors.
# tibble() is used because data_frame() is deprecated.
df <- tibble(item1, item2, n)
df %>%
  # number the rows within each keyword so every output row has a unique id;
  # row_number() also avoids confusion between n() and the column named n
  group_by(item1) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  # long format: one row per (keyword, id, variable)
  gather(temp, val, item2, n) %>%
  # build the output column names, e.g. "water_item2", "water_n"
  unite(temp2, item1, temp, sep = "_") %>%
  spread(temp2, val) %>%
  # the helper id is no longer needed once the rows are aligned
  select(-id)
# A tibble: 5 x 6
moon_item2 moon_n sun_item2 sun_n water_item2 water_n
<chr> <chr> <chr> <chr> <chr> <chr>
1 house 122 bird 300 tree 200
2 desk 46 table 250 dog 83
3 tiger 46 bed 77 cat 34
4 NA NA flower 77 fish 34
5 NA NA NA NA eagle 34

Performing a row by row chisq test on a data frame and capturing the result as a tibble

I have a data frame similar to this:
df1 <- data.frame(c(31,3447,12,1966,39,3275),
c(20,3460,10,1968,30,3284),
c(334,3146,212,1766,338,2976),
c(36,3442,35,1943,47,3267),
c(81,3399,71,1907,112,3202),
c(22,3458,22,1956,42,3272))
colnames(df1) <- c("Site1.C1","Site1.C2","Site2.C1","Site2.C2","Site3.C1","Site3.C2")
df1
Site1.C1 Site1.C2 Site2.C1 Site2.C2 Site3.C1 Site3.C2
1 31 20 334 36 81 22
2 3447 3460 3146 3442 3399 3458
3 12 10 212 35 71 22
4 1966 1968 1766 1943 1907 1956
5 39 30 338 47 112 42
6 3275 3284 2976 3267 3202 3272
I am converting each row into a table and then performing a chisq test.
In order get specific values from the chisq result (p value, parameter, statistic, expected, etc), I'm having to repeat chisq test several times over (in a very ugly and cumbersome way), using the following code:
# For every row, build a 3x2 table (one row per site, columns C1/C2) and run
# chisq.test() on it. The identical test is repeated eight times per row, once
# for each extracted component: p-value, degrees of freedom, and the six
# expected cell counts.
df2 <- df1 %>% rowwise() %>% mutate(P=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$p.value,
df=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$parameter,
Site1.c1.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[1,1],
Site1.c2.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[1,2],
Site2.c1.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[2,1],
Site2.c2.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[2,2],
Site3.c1.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[3,1],
Site3.c2.exp=chisq.test(rbind(c(Site1.C1,Site1.C2),c(Site2.C1,Site2.C2),c(Site3.C1,Site3.C2)))$expected[3,2])
as.data.frame(df2)
Site1.C1 Site1.C2 Site2.C1 Site2.C2 Site3.C1 Site3.C2 P df Site1.c1.exp Site1.c2.exp Site2.c1.exp Site2.c2.exp Site3.c1.exp Site3.c2.exp
1 31 20 334 36 81 22 2.513166e-08 2 43.40840 7.591603 314.9237 55.07634 87.66794 15.33206
2 3447 3460 3146 3442 3399 3458 2.760225e-02 2 3391.05464 3515.945362 3234.4387 3353.56132 3366.50668 3490.49332
3 12 10 212 35 71 22 4.743725e-04 2 17.92818 4.071823 201.2845 45.71547 75.78729 17.21271
4 1966 1968 1766 1943 1907 1956 1.026376e-01 2 1928.02242 2005.977577 1817.7517 1891.24831 1893.22588 1969.77412
5 39 30 338 47 112 42 2.632225e-10 2 55.49507 13.504934 309.6464 75.35362 123.85855 30.14145
6 3275 3284 2976 3267 3202 3272 2.686389e-02 2 3216.55048 3342.449523 3061.5833 3181.41674 3174.86626 3299.13374
Is there a more elegant way to do chisq test just once and capture the result as a tibble in the same row and then extract values on a need-to basis into additional columns?
My data frame has over a million of rows and some additional variables not used with the Chisq test.
Thank you.
With input from @akrun, I was able to get the desired result using the following code:
# Run chisq.test() once per row and keep the whole test object in a
# list-column, so individual components can be pulled out later as needed
# (e.g. while still rowwise: mutate(P = result$p.value)).
df2 <- df1 %>%
  rowwise() %>%
  mutate(
    result = list(chisq.test(rbind(
      c(Site1.C1, Site1.C2),
      c(Site2.C1, Site2.C2),
      c(Site3.C1, Site3.C2)
    )))
  )

Resources