I want to scrape the player data table from the following URL:
https://www.transfermarkt.de/mamadou-doucoure/profil/spieler/340480
Here's what I coded:
x <- read_html(url) %>%
html_node(xpath = '//div[#class="row collapse"]') %>%
html_table(fill = TRUE) %>%
as.data.frame() %>%
set_names(.,letters[1:ncol(.)])
As far as I understand, the player data isn't classed as a table, and I don't know how to edit the code. Also, I want to have the output in a data frame.
Dataframe could have many forms, having that player table in dataframe as-is might not be the most practical way, though here are a few examples. Some parts are bit tricky and solving those correctly depends on context and objective (e.g. multiple nationalities that currently end up as a single collapsed value)
library(rvest)
library(dplyr, warn.conflicts = F)
library(tidyr)
library(stringr)
url <- "https://www.transfermarkt.de/mamadou-doucoure/profil/spieler/340480"
html <- read_html(url)
# most basic aproach to extract just what's in the table + player name:
df_01 <- tibble(
feature = html_elements(html, "div.info-table > span.info-table__content--regular") %>% html_text() %>% str_squish(),
text = html_elements(html, "div.info-table > span.info-table__content--bold") %>% html_text() %>% str_squish()
) %>%
# player name is not included in div.info-table, add it separately
add_row(.before = 1,
feature = "Player:",
text = html_elements(html, "header > div.data-header__headline-container > h1") %>% html_text() %>% str_squish())
df_01
#> # A tibble: 15 × 2
#> feature text
#> <chr> <chr>
#> 1 Player: "#4 Mamadou Doucouré"
#> 2 Geburtsdatum: "21.05.1998"
#> 3 Geburtsort: "Dakar"
#> 4 Alter: "24"
#> 5 Größe: "1,83 m"
#> 6 Nationalität: "Frankreich Senegal"
#> 7 Position: "Abwehr - Innenverteidiger"
#> 8 Fuß: "links"
#> 9 Spielerberater: "Sport Avenir Management International"
#> 10 Aktueller Verein: "Borussia Mönchengladbach"
#> 11 Im Team seit: "01.07.2016"
#> 12 Vertrag bis: "30.06.2024"
#> 13 Letzte Verlängerung: "14.02.2020"
#> 14 2. Verein: "Borussia Mönchengladbach II (#3)"
#> 15 Social Media: ""
To include URLs we handle the first info-table column as before but processes
2nd one through map - not all entries have URLs and we don't want to end up with misaligned columns with different lengths:
df_02 <- tibble(
feature = html_elements(html, "div.info-table > span.info-table__content--regular") %>% html_text() %>% str_squish(),
) %>% bind_cols(
purrr::map_df(
html_elements(html, "div.info-table > span.info-table__content--bold"),
~ list(
html_text(.x) %>% stringr::str_squish() %>% na_if(""),
html_element(.x, "a") %>% html_attr("href")
) %>% setNames(c("text", "url"))
)
) %>% add_row(.before = 1,
feature = "Player:",
text = html_elements(html, "header > div.data-header__headline-container > h1") %>% html_text() %>% stringr::str_squish())
df_02
#> # A tibble: 15 × 3
#> feature text url
#> <chr> <chr> <chr>
#> 1 Player: #4 Mamadou Doucouré <NA>
#> 2 Geburtsdatum: 21.05.1998 /aktuell/waspassi…
#> 3 Geburtsort: Dakar <NA>
#> 4 Alter: 24 <NA>
#> 5 Größe: 1,83 m <NA>
#> 6 Nationalität: Frankreich Senegal <NA>
#> 7 Position: Abwehr - Innenverteidiger <NA>
#> 8 Fuß: links <NA>
#> 9 Spielerberater: Sport Avenir Management International /sport-avenir-man…
#> 10 Aktueller Verein: Borussia Mönchengladbach /borussia-monchen…
#> 11 Im Team seit: 01.07.2016 <NA>
#> 12 Vertrag bis: 30.06.2024 <NA>
#> 13 Letzte Verlängerung: 14.02.2020 <NA>
#> 14 2. Verein: Borussia Mönchengladbach II (#3) /borussia-monchen…
#> 15 Social Media: <NA> http://www.instag…
To have a tidy dataframe that could potentially take more players, missing text values are replaced by URLs and separate URL column is dropped:
df_03 <- df_02 %>%
mutate(feature = janitor::make_clean_names(feature),
`text` = coalesce(`text`,url)) %>%
select(-url) %>%
pivot_wider(names_from = feature, values_from = text) %>%
extract(player, into = c("number", "player"), "^#(\\d+) (.*)")
glimpse(df_03)
#> Rows: 1
#> Columns: 16
#> $ number <chr> "4"
#> $ player <chr> "Mamadou Doucouré"
#> $ geburtsdatum <chr> "21.05.1998"
#> $ geburtsort <chr> "Dakar"
#> $ alter <chr> "24"
#> $ grosse <chr> "1,83 m"
#> $ nationalitat <chr> "Frankreich Senegal"
#> $ position <chr> "Abwehr - Innenverteidiger"
#> $ fuss <chr> "links"
#> $ spielerberater <chr> "Sport Avenir Management International"
#> $ aktueller_verein <chr> "Borussia Mönchengladbach"
#> $ im_team_seit <chr> "01.07.2016"
#> $ vertrag_bis <chr> "30.06.2024"
#> $ letzte_verlangerung <chr> "14.02.2020"
#> $ x2_verein <chr> "Borussia Mönchengladbach II (#3)"
#> $ social_media <chr> "http://www.instagram.com/mams_dcr/"
Does anyone here have experience in identifying the most common phrases (3 ~ 7 consecutive words)? Understand that most analysis on frequency focuses on the most frequent/common word (along with plotting a WordCloud) rather than phrases.
# Assuming a particular column in a data frame (df) with n rows that is all text data
# as I'm not able to provide a sample data as using dput() on a large text file won't # be feasible here
Text = df$Text_Column
docs = Corpus(VectorSource(Text))
...
Thanks in advance!
You have several options to do this in R. Let's grab some data first. I use the books by Jane Austen from the janeaustenr and do some cleaning to have each paragrah in a separate row:
library(janeaustenr)
library(tidyverse)
books <- austen_books() %>%
mutate(paragraph = cumsum(text == "" & lag(text) != "")) %>%
group_by(paragraph) %>%
summarise(book = head(book, 1),
text = trimws(paste(text, collapse = " ")),
.groups = "drop")
With tidytext:
library(tidytext)
map_df(3L:7L, ~unnest_tokens(books, ngram, text, token = "ngrams", n = .x)) %>% # using multiple values for n is not directly implemented in tidytext
count(ngram) %>%
filter(!is.na(ngram)) %>%
slice_max(n, n = 10)
#> # A tibble: 10 × 2
#> ngram n
#> <chr> <int>
#> 1 i am sure 415
#> 2 i do not 412
#> 3 she could not 328
#> 4 it would be 258
#> 5 in the world 247
#> 6 as soon as 236
#> 7 a great deal 214
#> 8 would have been 211
#> 9 she had been 203
#> 10 it was a 202
With quanteda:
library(quanteda)
books %>%
corpus(docid_field = "paragraph",
text_field = "text") %>%
tokens(remove_punct = TRUE,
remove_symbols = TRUE) %>%
tokens_ngrams(n = 3L:7L) %>%
dfm() %>%
topfeatures(n = 10) %>%
enframe()
#> # A tibble: 10 × 2
#> name value
#> <chr> <dbl>
#> 1 i_am_sure 415
#> 2 i_do_not 412
#> 3 she_could_not 328
#> 4 it_would_be 258
#> 5 in_the_world 247
#> 6 as_soon_as 236
#> 7 a_great_deal 214
#> 8 would_have_been 211
#> 9 she_had_been 203
#> 10 it_was_a 202
With text2vec:
library(text2vec)
library(janeaustenr)
library(tidyverse)
books <- austen_books() %>%
mutate(paragraph = cumsum(text == "" & lag(text) != "")) %>%
group_by(paragraph) %>%
summarise(book = head(book, 1),
text = trimws(paste(text, collapse = " ")),
.groups = "drop")
library(text2vec)
itoken(books$text, tolower, word_tokenizer) %>%
create_vocabulary(ngram = c(3L, 7L), sep_ngram = " ") %>%
filter(str_detect(term, "[[:alpha:]]")) %>% # keep terms with at tleas one alphabetic character
slice_max(term_count, n = 10)
#> Number of docs: 10293
#> 0 stopwords: ...
#> ngram_min = 3; ngram_max = 7
#> Vocabulary:
#> term term_count doc_count
#> 1: i am sure 415 384
#> 2: i do not 412 363
#> 3: she could not 328 288
#> 4: it would be 258 233
#> 5: in the world 247 234
#> 6: as soon as 236 233
#> 7: a great deal 214 209
#> 8: would have been 211 192
#> 9: she had been 203 179
#> 10: it was a 202 194
Created on 2022-08-03 by the reprex package (v2.0.1)
The following function behaves as desired: several variables can be passed to group_by without the need to put them into alist() or dplyr::vars:
mean_by_grp <- function(df, meanvar, grp) {
grouping <- enexpr(grp) %>%
expr_deparse %>%
str_split(",",simplify = T) %>% `[`(1,) %>%
map(str_remove,"c\\(") %>% map(str_remove,"\\)") %>% map(str_trim) %>%
unlist %>% syms
df %>%
group_by(!!!syms(grouping)) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
starwars %>% mean_by_grp(height, c(species, homeworld))
However, it is complicated. I need to turn c(var1,....varn) into a string, split it and turn it into a list of symbols so I can use with with syms.
Isn't there a much easier way to do this?
Of course, I could use ellipses instead of grp, but then I can only have one argument that passes multiple symbols to another function.
One option would be dplyr::across:
mean_by_grp <- function(df, meanvar, grp) {
df %>%
group_by(across({{ grp }})) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
library(dplyr)
starwars %>% mean_by_grp(height, species)
#> # A tibble: 38 × 2
#> species average_height
#> <chr> <dbl>
#> 1 Aleena 79
#> 2 Besalisk 198
#> 3 Cerean 198
#> 4 Chagrian 196
#> 5 Clawdite 168
#> 6 Droid 131.
#> 7 Dug 112
#> 8 Ewok 88
#> 9 Geonosian 183
#> 10 Gungan 209.
#> # … with 28 more rows
starwars %>% mean_by_grp(height, c(species, homeworld))
#> # A tibble: 58 × 3
#> species homeworld average_height
#> <chr> <chr> <dbl>
#> 1 Aleena Aleen Minor 79
#> 2 Besalisk Ojom 198
#> 3 Cerean Cerea 198
#> 4 Chagrian Champala 196
#> 5 Clawdite Zolan 168
#> 6 Droid Naboo 96
#> 7 Droid Tatooine 132
#> 8 Droid <NA> 148
#> 9 Dug Malastare 112
#> 10 Ewok Endor 88
#> # … with 48 more rows
Lets suppose I have the following list of tibbles:
a_list_of_tibbles <- list(
a = tibble(a = rnorm(10)),
b = tibble(a = runif(10)),
c = tibble(a = letters[1:10])
)
Now I want to map them all into a single dataframe/tibble, which is not possible due to the differing column types.
How would I go about this?
I have tried this, but I want to get rid of the for loop
for(i in 1:length(a_list_of_tibbles)){
a_list_of_tibbles[[i]] <- a_list_of_tibbles[[i]] %>% mutate_all(as.character)
}
Then I run:
map_dfr(.x = a_list_of_tibbles, .f = as_tibble)
We could do the computation within the map - use across instead of the suffix _all (which is getting deprecated) to loop over the columns of the dataset
library(dplyr)
library(purrr)
map_dfr(a_list_of_tibbles,
~.x %>%
mutate(across(everything(), as.character) %>%
as_tibble))
-output
# A tibble: 30 × 1
a
<chr>
1 0.735200825884485
2 1.4741501589461
3 1.39870958697574
4 -0.36046362308853
5 -0.893860999301402
6 -0.565468636033674
7 -0.075270267983768
8 2.33534260196058
9 0.69667906338348
10 1.54213170143702
# … with 20 more rows
Another alternative is to use:
library(tidyverse)
map_depth(a_list_of_tibbles, 2, as.character) %>%
bind_rows()
#> # A tibble: 30 × 1
#> a
#> <chr>
#> 1 0.0894618169853206
#> 2 -1.50144637645091
#> 3 1.44795821718513
#> 4 0.0795342912030257
#> 5 -0.837985570593029
#> 6 -0.050845557103668
#> 7 0.031194556366589
#> 8 0.0989551909839589
#> 9 1.87007290229274
#> 10 0.67816212007413
#> # … with 20 more rows
Created on 2021-12-20 by the reprex package (v2.0.1)
I have the following data with ID and value:
id <- c("1103-5","1103-5","1104-2","1104-2","1104-4","1104-4","1106-2","1106-2","1106-3","1106-3","2294-1","2294-1","2294-2","2294-2","2294-2","2294-3","2294-3","2294-3","2294-4","2294-4","2294-5","2294-5","2294-5","2300-1","2300-1","2300-2","2300-2","2300-4","2300-4","2321-1","2321-1","2321-2","2321-2","2321-3","2321-3","2321-4","2321-4","2347-1","2347-1","2347-2","2347-2")
value <- c(6,3,6,3,6,3,6,3,6,3,3,6,9,3,6,9,3,6,3,6,9,3,6,9,6,9,6,9,6,9,3,9,3,9,3,9,3,9,6,9,6)
If you notice, there are multiple values for the same id. What I'd like to do is get the value that are only 3 and 6 only if the IDs are the same. for eg. ID "1103-5" has both 3 and 6, so it should be in the list, but not "2347-2"
I'm using R
One method I tried is the following, but it gives me everything with value 3 and 6.
d <- data.frame(id, value)
group36 <- d[d$value == 3 | d$value == 6,]
and
d %>% group_by(id) %>% filter(3 == value | 6 == value)
The output should be like this:
id value
1103-5 6
1103-5 3
1104-2 6
1104-2 3
1104-4 6
1104-4 3
1106-2 6
1106-2 3
1106-3 6
1106-3 3
2294-1 3
2294-1 6
2294-2 3
2294-2 6
2294-3 3
2294-3 6
2294-4 3
2294-4 6
2294-5 3
2294-5 6
d<-group_by(d,id)
filter(d,any(value==3),any(value==6))
This gives you all the IDs where there is both a value of 3 (somewhere) AND a value of 6 (somewhere). Mind you, your data contains some IDs with THREE values. In these cases, if both 3 and 6 are present, it will be included in the result.
If you want to exclude those lines that remain which done equal 3 or 6, add this:
filter(d,value==3 | value==6)
If you want to exclude IDs that also have 3 and 6 as values but also have OTHER values, use this:
filter(d,any(value==3),any(value==6),value==3 | value==6)
Not sure if this is what you want. We can filter rows that equal to either 3 or 6 then convert from long to wide format and keep only columns which have both 3 and 6 values. After that, convert back to long format.
library(dplyr)
library(tidyr)
id <- c("1103-5","1103-5","1104-2","1104-2","1104-4","1104-4","1106-2","1106-2",
"1106-3","1106-3","2294-1","2294-1","2294-2","2294-2","2294-2",
"2294-3","2294-3","2294-3","2294-4","2294-4","2294-5","2294-5","2294-5",
"2300-1","2300-1","2300-2","2300-2","2300-4","2300-4","2321-1","2321-1",
"2321-2","2321-2","2321-3","2321-3","2321-4","2321-4","2347-1","2347-1","2347-2","2347-2")
value <- c(6,3,6,3,6,3,6,3,6,3,3,6,9,3,6,9,3,6,3,6,9,3,6,9,6,9,6,9,6,9,3,9,3,9,3,9,3,9,6,9,6)
d <- data.frame(id, value)
d %>%
group_by(id) %>%
filter(value %in% c(3, 6)) %>%
mutate(rows = 1:n()) %>%
spread(key = id, value) %>%
select_if(~ all(!is.na(.)))
#> # A tibble: 2 x 11
#> rows `1103-5` `1104-2` `1104-4` `1106-2` `1106-3` `2294-1` `2294-2`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 6 6 6 6 6 3 3
#> 2 2 3 3 3 3 3 6 6
#> # ... with 3 more variables: `2294-3` <dbl>, `2294-4` <dbl>,
#> # `2294-5` <dbl>
d %>%
group_by(id) %>%
filter(value %in% c(3, 6)) %>%
mutate(rows = 1:n()) %>%
spread(key = id, value) %>%
select_if(~ all(!is.na(.))) %>%
select(-rows) %>%
gather(id, value)
#> # A tibble: 20 x 2
#> id value
#> <chr> <dbl>
#> 1 1103-5 6
#> 2 1103-5 3
#> 3 1104-2 6
#> 4 1104-2 3
#> 5 1104-4 6
#> 6 1104-4 3
#> 7 1106-2 6
#> 8 1106-2 3
#> 9 1106-3 6
#> 10 1106-3 3
#> 11 2294-1 3
#> 12 2294-1 6
#> 13 2294-2 3
#> 14 2294-2 6
#> 15 2294-3 3
#> 16 2294-3 6
#> 17 2294-4 3
#> 18 2294-4 6
#> 19 2294-5 3
#> 20 2294-5 6
Created on 2018-07-01 by the reprex package (v0.2.0.9000).