I'm trying to find the overlap between character columns in two different objects (data frames in this case) and return the percentage of overlap in R.
My data frames look something like this:
dput(lyrics_acousticness_upcoming0.01_frequency[1:100,])
structure(list(tolower.lyrics_acousticness_upcoming0.01. = structure(1:100, .Label = c(" late",
" miss it", " yeah", "1", "100", "1発打ったこのビートを鳴らしてよ",
"3x", "40", "5", "6", "9", "999", "a", "à", "aah", "aaliyah",
"about", "above", "account", "act", "actavis", "actin", "acting",
"acto", "acura", "add", "addicts", "adding", "admit", "afraid",
"after", "afterlife", "again", "against", "age", "ago", "ah",
"ahead", "ai", "aim", "ain", "air", "airin", "al", "album", "alguien",
"algún", "alive", "all", "almost", "alone", "along", "already",
"alright", "always", "am", "amazing", "amends", "âmes", "amor",
"an", "and", "and care", "and every morning", "and neither",
"and nobody", "and on", "and you shoulda too", "ange",
"angry", "another", "any", "anybody", "anyhow", "anymore", "anyone",
"anything", "apologized", "apparel", "arc", "are", "aren", "arenas",
"armes", "around", "arrest", "as", "ask", "asked", "askin", "at",
"atlanta", "attached", "attention", "attitude", "aussi", "austin",
"autres", "avenir", "aventura", "avenue", "away", "aww", "ayy",
"b", "baby", "babylon", "back", "backflips", "backwards", "backwoods",
"bad", "badder", "badge", "ball", "balloon", "band", "bandit",
"bank", "banks", "based", "basement", "bay", "bbk", "be", "be cool",
"beach", "beast", "beat", "beautiful", "beauty", "because", "becky",
"bed", "been", "beens", "bees", "before", "behind", "being",
"believe", "believed", "belong", "below", "bench", "bendin",
"bent", "best", "bet", "better", "big", "bigger", "billin", "billion",
"biscayne", "bitch", "bitches", "bitter", "black", "blade", "blame",
"blamin", "blatt", "blend", "bling", "block", "blossom", "blue",
"blunt", "bo", "boardin", "boat", "body", "bogey", "bone", "born",
"both", "bothers", "bottles", "bottom", "boucle", "bought", "bounce",
"bound", "bout", "bowtie", "boy", "boys", "bracelet", "brain",
"brand", "breads", "break", "breed", "bridge", "briefcase", "brim",
"bring", "britain", "britney", "broke", "buckle", "building",
"built", "bullshit", "bunny", "buried", "burning", "bus", "business",
"bust", "busy", "but", "butt", "buy", "by", "bye", "c", "cada",
"cadillacs", "cafe", "call", "callin", "came", "can", "cannot",
"cap", "capaces", "cards", "care", "carpet", "carried", "cars",
"cartier", "cash", "cashin", "catch", "catfish", "cathedral",
"cause", "cause i", "ce", "ceiling", "celebratin", "cell",
"certains", "cette", "chain", "champagne", "changed", "changes",
"cheapskate", "check", "chercher", "chest", "chill", "choice",
"choose", "chubbs", "ciel", "cielo", "city", "claimed", "class",
"cleansing", "clear", "clearly", "clearport", "clock", "close",
"closed", "closer", "clothes", "cloud", "clouds", "clubbin",
"coals", "cobblestone", "coeur", "coke", "cold", "collect", "college",
"come", "comfort", "comfortable", "coming", "comma", "comme",
"common", "condo", "control", "cool", "corner", "cosign", "could",
"couldn", "count", "countin", "country", "couple", "coups", "courtside",
"cover", "cozy", "crack", "crackhead", "crave", "crazy", "create",
"credit", "crest", "crew", "crib", "crime", "crossed", "crown",
"cure", "cursive", "cut", "cuz", "cuz in", "d", "dabble", "dame",
"damn", "damnation", "damned", "damon", "dance", "dangerous",
"dar", "dare", "darling", "dawg", "dawn", "day", "days", "de",
"dead", "deal", "dealing", "deals", "death", "debatin", "decide",
"decision", "decisions", "decorating", "deep", "deepest", "defend",
"definitely", "dem", "denzel", "des", "deserve", "desire", "destruction",
"detest", "devil", "di", "diamonds", "did", "didn", "die", "died",
"diet", "different", "difícil", "diggin", "direct", "dirty",
"dirtyin", "dis", "disait", "distance", "ditches", "divide",
"do", "dodging", "does", "doesn", "dog", "dogs", "doin", "doing",
"dois", "dolled", "dolly", "dominatin", "don", "done", "donuts",
"door", "doorknob", "dormais", "dormir", "dosed", "double", "doubt",
"down", "downplayin", "drake", "drama", "dre", "dream", "dreams",
"drink", "drive", "driven", "driving", "drizzy", "drop", "drove",
"drowned", "drug", "drum", "dry", "dubplate", "dude", "due",
"dug", "dumb", "dying", "dеfend", "e", "each", "early", "eastside",
"easy", "eat", "eating", "economy", "ed", "edge", "eh", "either",
"el", "else", "em", "embassy", "emotion", "en", "end", "endlessly",
"ends", "enemies", "energizer", "enfuir", "enough", "entends",
"entero", "entourage", "envolent", "es", "escape", "especially",
"espère", "est", "eternal", "eternity", "eux", "even", "ever",
"every", "everybody", "everyday", "everyone", "everything", "everywhere",
"evil", "exactly", "excited", "execute", "expect", "expecting",
"exposed", "exposеd", "extra", "eye", "eyes", "f", "face", "fact",
"fade", "failure", "fair", "fait", "faith", "fake", "fall", "fam",
"fame", "famous", "fantasies", "fast", "fate", "fault", "fausses",
"faut", "favorite", "fear", "fears", "features", "fed", "feel",
"feel straight", "feeling", "feelings", "feet", "fela", "fell",
"felt", "feu", "feus", "fight", "fighting", "final", "finally",
"find", "fine", "fines", "finger", "fingertips", "finna", "firm",
"first", "fish", "fit", "five", "fix", "fixate", "fl", "flame",
"flashy", "flatline", "flex", "float", "floor", "floors", "flow",
"fly", "flyin", "flying", "fo", "fooled", "for", "for myself",
"forced", "fore", "forever", "forget", "fort", "found", "fout",
"fragrance", "frankenstein", "franny", "frère", "fresh", "friday",
"friend", "friendly", "friends", "from", "front", "fuck", "fucked",
"fuckin", "fucking", "fucks", "full", "funny", "furs", "further",
"future", "g", "gala", "gallon", "game", "gang", "gangstas",
"gangster", "gasoline", "gave", "get", "gets", "gettin", "getting",
"gibbo", "gig", "gimme", "gimmе", "girl", "girls", "give", "given",
"glass", "glasses", "glowin", "go", "god", "goddamn", "goes",
"goin", "going", "gold", "gon", "gone", "gonna", "good", "got",
"gotta", "gotten", "gotti", "graduated", "grammy", "grams", "grandeza",
"grandir", "grave", "gravel", "greatest", "greatness", "greaze",
"green", "grew", "grind", "grippin", "grips", "groovy", "guarantee",
"guardin", "guest", "guidance", "guide", "gun", "guy", "guys",
"ha", "had", "hair", "hall", "halloween", "hand", "hands", "hang",
"hangin", "happened", "happiness", "hard", "harder", "has", "hate",
"hates", "haunted", "have", "have some love", "havin", "hawks",
"he", "head", "headlines", "hear", "heard", "hearin", "heart",
"heat", "heaven", "heavenly", "hectic", "hell", "hello", "help",
"hen", "henn", "hennessy", "her", "here", "hey", "hi", "hide",
"high", "higher", "him", "himself", "his", "hit", "hits", "ho",
"hoes", "hold", "holiday", "holler", "holy", "home", "homies",
"honey", "hood", "hookers", "hope", "hot", "hotel", "hotline",
"house", "how", "howard", "huh", "hundred", "hundreds", "hurt",
"hustlin", "hype", "i", "i have smaller", "i try", "i wanted to go",
"ice", "icy", "if", "il", "ill", "ils", "imagine", "immortal",
"important", "in", "inhibitions", "innocent", "inside", "inspirin",
"into", "invested", "invitation", "invited", "involved", "ironic",
"is", "islands", "isn", "it", "italians", "its", "j", "jacket",
"jail", "jang", "jealousy", "jeans", "jeez", "jet", "jeu", "jheeze",
"job", "jokes", "joking", "julie", "just", "karaoke", "keep",
"keep on", "keeping", "keepsake", "kettle", "keychain", "kid",
"kiddin", "kill", "kind", "kinda", "king", "kiss", "knew", "know",
"know i", "knowing", "known", "knows", "kobe", "kush", "kuti",
"l", "la", "là", "labels", "lady", "land", "lang", "language",
"larmes", "last", "late", "lately", "lay", "le", "lead", "leads",
"leaf", "learned", "learning", "least", "leather", "leave", "leavin",
"leaving", "left", "legacy", "legal", "lens", "les", "less",
"lessons", "let", "lettre", "lie", "lies", "life", "lifestyle",
"light", "like", "line", "linea", "lis", "list", "listen", "liter",
"little", "live", "lived", "livin", "living", "ll", "lo", "lonely",
"long", "look", "looking", "looks", "lord", "lose", "lost", "lot",
"lotta", "louis", "love", "loved", "loves", "low", "loyalty",
"luke", "luna", "luther", "lyin", "m", "m2", "m6", "ma", "mac",
"machine", "mack", "mad", "made", "madness", "magazine", "magdalene",
"maint", "major", "make", "makes", "makin", "making", "male",
"male friends", "malibu", "mall", "mally", "mama", "man", "mannie",
"mansions", "many", "margherita", "mark", "marry", "martin",
"martyr", "más", "mascot", "matrix", "matter", "matters", "maybe",
"me", "mean", "mean i", "meant", "medical", "medicine", "melt",
"mêmes", "memphis", "men", "menti", "mention", "menu", "met",
"metal", "mi", "miami", "miedo", "miette", "might", "mil", "mile",
"miles", "million", "millions", "mind", "mine", "mines", "minute",
"mirror", "miss", "missed", "missions", "missus", "mistake",
"mistakes", "mixed", "mixtape", "mj", "mm", "mmm", "mo", "moi",
"moment", "monday", "monde", "money", "monster", "months", "montre",
"moon", "more", "most", "mostrarle", "motel", "motherfucker",
"motherfuckers", "motherfuckin", "motion", "motor", "motto",
"mountain", "mouth", "move", "moved", "moves", "movie", "movin",
"mr", "much", "muddy", "multiplying", "must", "my", "my heart",
"myself", "n", "na", "nadie", "nan", "nant", "nasty", "naturally",
"ne", "need", "needed", "needle", "needs", "negative", "neither",
"nerve", "nerves", "nervous", "never", "nevermind", "new", "news",
"next", "nice", "nicki", "nigga", "niggas", "night", "nights",
"niko", "nine", "nir", "no", "nobody", "noddin", "noise", "none",
"normal", "not", "nother", "nothin", "nothing", "nous", "now",
"nuit", "number", "nеrve", "o", "of", "off", "oh", "ojos", "okay",
"old", "older", "on", "once", "one", "online", "only", "ooh",
"open", "opened", "opi", "opinion", "opposite", "or", "orleans",
"ot", "other", "oti", "ouais", "our", "out", "outs", "outta",
"over", "overrated", "overseas", "ovo", "owe", "own", "p", "paddle",
"pages", "paid", "pain", "panoramic", "papa", "paparazzi", "papers",
"para", "pardon", "park", "part", "partir", "parton", "party",
"pas", "pass", "passion", "passport", "past", "patek", "patience",
"patron", "pay", "peace", "peach", "peaks", "peel", "pen", "pendus",
"people", "pepperoni", "percocet", "perderme", "perdus", "persian",
"person", "personified", "pete", "phone", "photo", "piece", "pimp",
"pimpin", "pink", "pippen", "pissed", "place", "places", "plan",
"planeta", "planner", "plans", "plat", "platinum", "play", "players",
"playground", "playin", "plus", "po", "pocket", "point", "poison",
"pole", "police", "politician", "pool", "pop", "por", "posing",
"poster", "posеd", "pour", "powers", "prada", "pray", "preaching",
"precious", "prefer", "president", "presidential", "pretend",
"pretty", "price", "pride", "prime", "prison", "probably", "problem",
"problems", "profile", "promis", "proof", "protest", "proud",
"prove", "psychic", "puerta", "puisque", "pull", "purpose", "pushin",
"pushing", "pussy", "pussyholes", "put", "puttin", "qu", "qualification",
"quand", "que", "quick", "quit", "quitte", "quizá", "r", "race",
"rain", "ralentit", "ran", "rap", "rapper", "rappin", "ras",
"rate", "rather", "razor", "re", "reached", "reaction", "read",
"ready", "real", "realest", "reality", "realize", "really", "reason",
"record", "records", "red", "referring", "refuse to", "regarde",
"relay", "religion", "remake", "remember", "reply", "reputation",
"researchin", "reservations", "respect", "rest", "restera", "restraint",
"retirement", "rev", "revenge", "rhythm", "rich", "riddim", "ride",
"ridge", "riding", "right", "ring", "rips", "risk", "road", "rock",
"rockin", "roll", "rolled", "rollie", "rolling", "romance", "room",
"rough", "round", "row", "rr", "rubber", "rugs", "ruin", "run",
"runnin", "running", "runs", "rush", "rushing", "s", "sad", "safe",
"sag", "said", "saigne", "sailed", "sake", "salad", "salir",
"same", "santa", "satan", "save", "saviour", "saw", "say", "sayin",
"saying", "says", "scared", "scene", "scènes", "scottie", "scratch",
"sea", "seat", "secret", "secrets", "security", "see", "seem",
"seen", "self", "send", "sense", "sensе", "separa", "service",
"seven", "seventeen", "sex", "shame", "share", "she", "sheet",
"shelter", "sheriff", "shh", "shine", "shing", "ship", "shirt",
"shit", "shocking", "shoes", "shorty", "shots", "should", "shoulda",
"shouldn", "shout", "shovel", "show", "shower", "showin", "showing",
"showtime", "shut", "si", "side", "sides", "sign", "signals",
"silence", "simple", "since", "singin", "singing", "single",
"sit", "sittin", "sitting", "six", "skeet", "sleep", "slice",
"slow", "smaller", "smart", "smoke", "snowin", "snug", "so",
"social", "solid", "solo", "some", "somebody", "someday", "someone",
"something", "sometimes", "somewhere", "son", "sont", "soon",
"soul", "sound", "sound different", "south", "space", "spaceship",
"spanish", "speak", "special", "speed", "spell", "spend", "spit",
"spotlight", "spray", "squad", "stacks", "stand", "star", "start",
"started", "starting", "state", "statement", "stay", "stick",
"still", "stones", "stop", "stories", "story", "straddling",
"straight", "street", "streets", "strength", "stressed", "stripper",
"strong", "stuck", "study", "stunna", "stunt", "stupid", "style",
"such", "sugar", "suicide", "summer", "sunshine", "supermodel",
"suppose", "supposed", "sur", "sure", "surely", "sûrement",
"surface", "surprised", "surrounded", "survived", "suzuki", "swag",
"swam", "swear", "sweepstakes", "sweet", "sweetest", "swervin",
"switch", "switches", "switchin", "symptom", "t", "t say what",
"tag", "tailor", "take", "taken", "takes", "takin", "taking",
"taliban", "talk", "talkin", "talking", "talks", "tall", "taller",
"tanto", "taping", "taught", "te", "tea", "team", "tech", "tell",
"tellin", "telling", "tells", "temps", "tengo", "tenir", "tennessee",
"tenue", "testimony", "text", "texts", "than", "thang", "that",
"the", "them", "then", "there", "these", "they", "thing", "things",
"think", "thinkin", "thinking", "thirsty", "thirty", "this",
"those", "thou", "though", "thought", "thoughts", "thread", "three",
"through", "throwin", "thе", "ti", "ticket", "tied", "tiempo",
"ties", "tight", "til", "till", "time", "times", "timin", "ting",
"tints", "tit", "tits", "to", "toast", "today", "todos", "together",
"toi", "told", "tongue", "tonight", "too", "toodles", "took",
"top", "tore", "toronto", "tote", "touch", "touched", "touching",
"tough", "toujours", "tour", "tout", "town", "tracin", "traitors",
"trampoline", "trauma", "travel", "treadmill", "treat", "tree",
"tried", "trigger", "trillion", "trip", "trippin", "trop", "trophy",
"trouble", "troubled", "true", "trust", "truth", "try", "try to be cool",
"trying", "tryna", "tu", "tune", "tunechi", "tunnel", "turn",
"turnin", "turns", "turnt", "tus", "twenty", "twisted", "two",
"tyme", "type", "ugk", "uh", "uhh", "umbrella", "umpire", "un",
"una", "unbearable", "uncle", "under", "underground", "understand",
"undiscovered", "unless", "unlucky", "unreal", "unruly", "until",
"up", "upside", "uptown", "us", "used", "values", "vas", "vayan",
"ve", "venue", "venus", "verme", "very", "veut", "vez", "viens",
"view", "views", "villain", "vision", "voodoo", "voulais", "waist",
"wait", "walk", "walking", "wall", "walls", "wanna", "want",
"want to", "want to know", "wanted", "wants", "warm", "was",
"wasn", "waste", "wasted", "watch", "watched", "water", "way",
"wayans", "ways", "we", "weak", "weakness", "weapon", "wear",
"wearing", "wedding", "weed", "weekend", "well", "went", "were",
"wet", "what", "whatever", "when", "whenever", "where", "where my",
"whereabouts", "which", "whip", "whippin", "whitney", "who",
"whoever", "whole", "why", "wife", "wildin", "will", "williams",
"win", "wind", "wine", "wings", "winnin", "wise", "wish", "wished",
"wishing", "with", "with a", "without", "woman", "women", "won",
"wonder", "wonderin", "wondering", "woo", "wood", "woodgrain",
"word", "work", "workin", "working", "world", "worry", "worse",
"worth", "would", "wouldn", "wow", "wrist", "wrong", "wrote",
"wе", "xanax", "y", "ya", "yachts", "yale", "yeah", "year",
"years", "yes", "yesterday", "ymca", "ymcmb", "yo", "yolo", "york",
"you", "young", "youngest", "your", "yours", "yourself", "zone",
"ぁあ", "あなたの言葉は この鼻を伸ばす", "あの子の顔 探しても見当たらない",
"あれほど生きてきたけど全ては夢みたい", "あれもこれも",
"あれもこれも魅力的でも私は君がいい", "あんたに聴かすだけだから",
"カッコの悪い 恋の歌", "ここに", "このくらいがちょうどいいかなって",
"さぁ", "さながら", "さよならがあんたに捧ぐ愛の言葉",
"さよならべいべ", "そのエロい体を貸してくれ",
"その一瞬の隙を運命は 見逃してくれない",
"それでいいでしょ", "そんなことはどうでもいいからって",
"だいたいこんなもん", "だいたいそんなもん",
"だから涙は見せずに さよならべいべ", "だから笑って手を振る さよならべいべ",
"だけどそれが分かって本当に良かった", "ちょっと待ったって",
"ついハイになって", "で", "でも何もかもが不快だね",
"どうかしそうやこの胸は", "どこにいたの 探してたよ",
"どこまでも どこまでも", "どっかいっちゃって",
"どれほど朽ち果てようと最後にゃ笑いたい",
"なんとかしてや", "の歌詞", "はみ出したったモノを隠す場所もない",
"まぁいいかな", "まぁいいから", "まやかしの宝 見せかけの光",
"もう二度と犯さない", "もう行く時間か 最後までカッコ悪いわしじゃったな",
"わしかてずっと一緒におりたかったわ", "世間は許しちゃくれやせんよ",
"今日も愉快だね", "伝えた日", "何か分かったようで 何も分かってなくて",
"何のために戦おうとも動機は愛がいい", "何もかも 捨ててくよ",
"余裕のない 愛の言葉", "優しかった いつも支えてくれた 信じてた",
"先天的だ過敏なこの衝動", "別れはみんないつか通る道じゃんか",
"刺さった枯れ木に注いだ火", "前に進むことしか出来ん道じゃから",
"叶えたい想像に", "君とならば さらり さらり",
"君とならばさらり さらり", "塞いだ目", "大炎上大炎上大炎上大炎上大炎上",
"常にここに ここに", "引き返せなくて", "当たり前なんてない",
"恥ずかしいカン違い", "息せき切ってきたの",
"意地はっても すぐに崩れるし", "手を抜いて",
"振られるくらいが僕らしいんだもん", "携帯越しの タッグファイト",
"支えたいと君に", "散々だな後悔はないけど",
"新しい扉を叩き割った", "新しい日々は探さずとも常に ここに",
"新しい日々も 拙い過去も 全てがきらり",
"明け行く夕日の中を今夜も昼下がり さらり",
"時には途方に暮れてただ風に吹かれて ゆらり",
"時間てこんな 冷たかったかな", "来んと思った 時はすぐに来た",
"気付けば", "気恥ずかしいから 置き手紙だけで許してな",
"永遠に きらり", "泣いとる時間もないようになるけどな 今",
"瀬戸際の見栄が この首を絞める", "無くしてしまったものを振り返って ほろり",
"煩わしいから 何にも包まずにおくわ", "珍回答だ僕は気にしないけど",
"理不尽がお決まりの世界だもん", "生きてきたけど全ては夢みたい",
"着色の言葉 無味無臭の心", "私だって 私だって",
"空気の読めぬ 恋の歌", "紛らわしいから まっすぐな言葉にするわ",
"自分で一人 生きてきたって 果たしたって",
"自分にも周りの人にもバレないように", "自分にも周りの人にもバレないように利を振って",
"自分のモンなんてない", "自己満たちが踊る世界だもん",
"色々見てきたけれどこの瞳は永遠に きらり",
"荒れ狂う 季節の中も 群衆の中も", "荒れ狂う季節の中を二人は一人きり さらり",
"藤井風", "行き先は決めたの", "行き違って 行き違って",
"裸のまま透明な服を着た王様だ", "裸足だって 裸足だって",
"見栄はっても すぐに剥がれるし", "誰も知らない ミッドナイト",
"誰も見とらんから少しくらいええかな", "調子のっちゃって",
"調子のんないで", "迷わずに行きたいけど保証はしないよ",
"連れてって 連れてって", "風に乗って 風に乗って",
"飾りのない 愛の言葉", "魅力的でも私は君がいい"
), class = "factor"), Freq = c(1L, 2L, 4L, 2L, 4L, 2L, 1L, 2L,
2L, 4L, 1L, 2L, 670L, 1L, 4L, 6L, 87L, 1L, 22L, 28L, 2L, 2L,
3L, 2L, 12L, 3L, 1L, 15L, 6L, 4L, 3L, 2L, 20L, 1L, 5L, 8L, 38L,
16L, 2L, 2L, 90L, 68L, 2L, 2L, 1L, 2L, 2L, 14L, 220L, 6L, 43L,
14L, 14L, 1L, 53L, 91L, 10L, 2L, 4L, 2L, 12L, 577L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 4L, 5L, 7L, 9L, 4L, 18L, 8L, 12L, 2L, 5L, 4L,
78L, 2L, 2L, 4L, 32L, 1L, 38L, 39L, 13L, 6L, 54L, 4L, 2L, 6L,
5L, 2L, 1L, 1L, 1L, 3L)), row.names = c(NA, 100L), class = "data.frame")
dput(sad_words[1:100,])
structure(list(word = c("abandon", "abandoned", "abandonment",
"abduction", "abortion", "abortive", "abscess", "absence", "absent",
"absentee", "abuse", "abysmal", "abyss", "accident", "accursed",
"ache", "aching", "adder", "adrift", "adultery", "adverse", "adversity",
"afflict", "affliction", "affront", "aftermath", "aggravating",
"agony", "ail", "ailing", "alcoholism", "alienated", "alienation",
"anathema", "anchorage", "anguish", "animosity", "annihilated",
"annihilation", "annulment", "anthrax", "antisocial", "anxiety",
"apathetic", "apathy", "apologize", "appendicitis", "arid", "arraignment",
"arsenic", "art", "ashamed", "ashes", "assailant", "assassin",
"assassination", "atherosclerosis", "atrocity", "atrophy", "attacking",
"attenuation", "austere", "autopsy", "avalanche", "awful", "backwater",
"bacteria", "bad", "badly", "bang", "banish", "banished", "banishment",
"bankrupt", "bankruptcy", "banshee", "barren", "bastard", "battered",
"battled", "beating", "beg", "beggar", "belittle", "bereaved",
"bereavement", "betray", "betrayal", "bier", "bigoted", "bitch",
"bitterly", "bitterness", "blackness", "bleak", "bleeding", "blemish",
"blight", "blighted", "blindly"), sadness = c(TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
The entries of the tolower.lyrics_acousticness_upcoming0.01. column in the lyrics_acousticness_upcoming0.01_frequency data frame need to be checked for overlap against the word column of the sad_words data frame. What I need is the percentage of overlapping words out of all the words present in lyrics_acousticness_upcoming0.01_frequency.
All suggestions are welcome.
First of all, I suggest always including a small input dataset so that the problem is easily reproducible.
Is this the percentage you are looking for?
library(dplyr)

overlapped = lyrics_acousticness_upcoming0.01_frequency %>%
  select(tolower.lyrics_acousticness_upcoming0.01.) %>%
  unique() %>%
  rename(word = "tolower.lyrics_acousticness_upcoming0.01.") %>%
  inner_join(sad_words %>% select(word) %>% unique(), by = "word")

# number of distinct words in the lyrics data frame
total_words_lyrics_acousticness_upcoming0.01_frequency =
  lyrics_acousticness_upcoming0.01_frequency %>%
  select(all_word) %>%
  unique() %>%
  tally()

# number of distinct overlapping words
total_words_matching = overlapped %>%
  select(word) %>%
  unique() %>%
  tally()

# proportion of lyric words that also appear in sad_words
pct = total_words_matching$n /
  total_words_lyrics_acousticness_upcoming0.01_frequency$n
I assumed all_word is the column you meant by "out of all the words present in the lyrics_acousticness_upcoming0.01_frequency dataframe"; substitute your actual column name.
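For reference, the same number can be computed in a single pipe. This is only a sketch: it assumes tolower.lyrics_acousticness_upcoming0.01. is the column to count over, since all_word is not in the dput.

library(dplyr)

lyrics_acousticness_upcoming0.01_frequency %>%
  distinct(word = tolower.lyrics_acousticness_upcoming0.01.) %>%  # distinct lyric words
  summarize(pct = mean(word %in% sad_words$word) * 100)           # % also in sad_words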
You can get the proportion of elements from one vector in another with mean(x %in% y). For your data, this would be:
mean(lyrics_acousticness_upcoming0.01_frequency$tolower.lyrics_acousticness_upcoming0.01. %in% sad_words$word)
# 0
None of the words in your first dataframe appear in the second, so the % overlap is 0.
If we make up an example that does have some overlap:
set.seed(13)
data(words, package = "stringr")
df1 <- data.frame(x = sample(words, 500))
df2 <- data.frame(y = sample(words, 500))
mean(df1$x %in% df2$y)
# 0.482
About 48% of words in df1$x appear in df2$y.
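If you want a percentage rather than a proportion, or want each distinct word counted only once, the same idea carries over (shown on the made-up df1/df2 above):

mean(df1$x %in% df2$y) * 100      # same proportion, expressed as a percentage
mean(unique(df1$x) %in% df2$y)    # over distinct words (identical here, since
                                  # sample() draws without replacement)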
library(rvest)
library(dplyr)
library(tidyr)
library(spotifyr)
library(tidytext)
library(textdata)
Using the above libraries, I scraped artist data from Spotify using an API token.
I have the data of words tagged with a sentiment (anger, say) and the details of the songs.
I now want to loop over multiple such word categories (e.g. anger) and see which words are most used for that particular emotion; in general, I also want to plot a histogram of the words used in the songs.
So I use the following functions:
data %>%
  unnest() %>%
  unnest_tokens(word, lyric) %>%
  anti_join(stop_words, by = "word") %>%
  left_join(angry_words, by = "word") %>%
  group_by(track_name, energy, album_name, duration_ms, valence) %>%
  summarize(angry_words = sum(anger, na.rm = TRUE)) %>%
  ungroup() %>%
  select(track_name, album_name, angry_words) %>%
  arrange(desc(angry_words))
Every time I run the code, I get the following error:
Error in `fn()`:
! In row 64, can't recycle input of size 3 to size 2.
Run `rlang::last_error()` to see where the error occurred.
Warning message:
`cols` is now required when using unnest().
Please use `cols = c(album_images, artists, available_markets)`
All suggestions will be helpful.
Here are the data and angry_words data frames:
data <- structure(list(artist_name = c("María José Llergo", "María José Llergo"
), artist_id = c("70GBRlKEGjfueop2lfdQ4Q", "70GBRlKEGjfueop2lfdQ4Q"
), album_id = c("6BMyWViSAgXtUVlPfXiGES", "6BMyWViSAgXtUVlPfXiGES"
), album_type = c("album", "album"), album_images = list(structure(list(
height = c(640L, 300L, 64L), url = c("https://i.scdn.co/image/ab67616d0000b2735f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d00001e025f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d000048515f3d845e18e06df1bbe95178"
), width = c(640L, 300L, 64L)), class = "data.frame", row.names = c(NA,
3L)), structure(list(height = c(640L, 300L, 64L), url = c("https://i.scdn.co/image/ab67616d0000b2735f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d00001e025f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d000048515f3d845e18e06df1bbe95178"
), width = c(640L, 300L, 64L)), class = "data.frame", row.names = c(NA,
3L))), album_release_date = c("2020-01-31", "2020-01-31"), album_release_year = c(2020,
2020), album_release_date_precision = c("day", "day"), danceability = c(0.612,
0.5), energy = c(0.342, 0.267), key = c(4L, 7L), loudness = c(-9.193,
-11.736), mode = 0:1, speechiness = c(0.0419, 0.0448), acousticness = c(0.358,
0.815), instrumentalness = c(0.000502, 2.66e-06), liveness = c(0.257,
0.0981), valence = c(0.122, 0.264), tempo = c(99.993, 114.192
), track_id = c("7pB0e4E78UfAmKBPzQPo8a", "1sgH6adzL1BBaIXRC7NOYI"
), analysis_url = c("https://api.spotify.com/v1/audio-analysis/7pB0e4E78UfAmKBPzQPo8a",
"https://api.spotify.com/v1/audio-analysis/1sgH6adzL1BBaIXRC7NOYI"
), time_signature = 3:4, artists = list(structure(list(href = "https://api.spotify.com/v1/artists/70GBRlKEGjfueop2lfdQ4Q",
id = "70GBRlKEGjfueop2lfdQ4Q", name = "María José Llergo",
type = "artist", uri = "spotify:artist:70GBRlKEGjfueop2lfdQ4Q",
external_urls.spotify = "https://open.spotify.com/artist/70GBRlKEGjfueop2lfdQ4Q"), class = "data.frame", row.names = 1L),
structure(list(href = "https://api.spotify.com/v1/artists/70GBRlKEGjfueop2lfdQ4Q",
id = "70GBRlKEGjfueop2lfdQ4Q", name = "María José Llergo",
type = "artist", uri = "spotify:artist:70GBRlKEGjfueop2lfdQ4Q",
external_urls.spotify = "https://open.spotify.com/artist/70GBRlKEGjfueop2lfdQ4Q"), class = "data.frame", row.names = 1L)),
available_markets = list(c("AD", "AE", "AG", "AL", "AM",
"AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", "BE", "BF",
"BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW",
"BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO",
"CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO",
"DZ", "EC", "EE", "EG", "ES", "FI", "FJ", "FM", "FR", "GA",
"GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW",
"GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN",
"IQ", "IS", "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
"KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK",
"LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "ME",
"MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV",
"MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO",
"NP", "NR", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL",
"PS", "PT", "PW", "PY", "QA", "RO", "RS", "RW", "SA", "SB",
"SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", "ST",
"SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR",
"TT", "TV", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC",
"VE", "VN", "VU", "WS", "XK", "ZA", "ZM", "ZW"), c("AD",
"AE", "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BA",
"BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO",
"BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH",
"CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE",
"DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "FI",
"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN",
"GQ", "GR", "GT", "GW", "GY", "HK", "HN", "HR", "HT", "HU",
"ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "JP",
"KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", "LA",
"LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY",
"MA", "MC", "MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO",
"MR", "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", "NE",
"NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", "PA", "PE",
"PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO",
"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL",
"SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ",
"TL", "TN", "TO", "TR", "TT", "TV", "TW", "TZ", "UA", "UG",
"US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", "ZA",
"ZM", "ZW")), disc_number = c(1L, 1L), duration_ms = c(197316L,
313028L), explicit = c(FALSE, FALSE), track_href = c("https://api.spotify.com/v1/tracks/7pB0e4E78UfAmKBPzQPo8a",
"https://api.spotify.com/v1/tracks/1sgH6adzL1BBaIXRC7NOYI"
), is_local = c(FALSE, FALSE), track_name = c("¿De Qué Me Sirve Llorar?",
"Niña De Las Dunas"), track_preview_url = c("https://p.scdn.co/mp3-preview/1ed3fba536f1813af99c88f69893dfe6272df847?cid=cf686ca455c74783b8f27d0c35dfc5b0",
"https://p.scdn.co/mp3-preview/e4f9386ef79ff5027800aa9ccd8560a622df28d0?cid=cf686ca455c74783b8f27d0c35dfc5b0"
), track_number = 1:2, type = c("track", "track"), track_uri = c("spotify:track:7pB0e4E78UfAmKBPzQPo8a",
"spotify:track:1sgH6adzL1BBaIXRC7NOYI"), external_urls.spotify = c("https://open.spotify.com/track/7pB0e4E78UfAmKBPzQPo8a",
"https://open.spotify.com/track/1sgH6adzL1BBaIXRC7NOYI"),
album_name = c("Sanación", "Sanación"), key_name = c("E",
"G"), mode_name = c("minor", "major"), key_mode = c("E minor",
"G major")), row.names = 1:2, class = "data.frame")
angry_words <- structure(list(word = c("abandoned", "abandonment", "abhor",
"abhorrent", "abolish", "abomination", "abuse", "accursed", "accusation",
"accused", "accuser", "accusing", "actionable", "adder", "adversary",
"adverse", "adversity", "advocacy", "affront", "aftermath", "aggravated",
"aggravating", "aggravation", "aggression", "aggressive", "aggressor",
"agitated", "agitation", "agony", "alcoholism", "alienate", "alienation",
"allegation", "altercation", "ambush", "anarchism", "anarchist",
"anarchy", "anathema", "anger", "angry", "anguish", "animosity",
"animus", "annihilate", "annihilated", "annihilation", "annoy",
"annoyance", "annoying", "antagonism", "antagonist", "antagonistic",
"antichrist", "antipathy", "antisocial", "antithesis", "anxiety",
"argue", "argument", "argumentation", "arguments", "armament",
"armed", "arraignment", "arrogant", "arson", "assail", "assailant",
"assassin", "assassinate", "assassination", "assault", "asshole",
"atrocious", "atrocity", "attack", "attacking", "attorney", "avarice"
), anger = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)), row.names = c(NA, -80L), class = c("tbl_df",
"tbl", "data.frame"))
So I need to prepare a term-document matrix for each of the sets of texts that I want to subsequently run against each other in a classification procedure (rolling.classify() in the stylo package).
I created a TDM of the whole text corpus and now want to make two subsets of selected texts, one of which should contain only a single text. Subsetting multiple texts works fine (a), but a single text does not (b). Why can't I do this?
freq.list <- make.frequency.list(words, head = 265) # Creating frequency list using only the frequencies of the
# selected features from word-list (words)
word.frequencies <- make.table.of.frequencies(corpus = x, features = freq.list)
# Document-term matrix of whole corpus and matching frequencies.
# Making two subsets now:
a <- word.frequencies[c(1,2,3,17,19,20,21,22,23), 1:263]
dim(a) # Double-check that it is the right no. of texts
b <- word.frequencies[18,1:263]
dim(b) # Double-check
> dim(a)
[1] 9 263
> dim(b)
NULL
Data (obtained with dput()):
x <- structure(list(middleFr_Calmative_1946 = c("the", "calmative",
"i", "don’t", "know", "when", "i", "died", ".", "it", "always",
"seemed", "to", "me", "i", "died", "old", ",", "about", "ninety",
"years", "old", ",", "and", "what", "years", ",", "and", "that",
"my", "body", "bore", "it", "out", ",", "from", "head", "to",
"foot", ".", "but", "this", "evening", ",", "alone", "in", "my",
"icy", "bed", ",", "i", "have", "the", "feeling", "i’ll", "be",
"older", "than", "the", "day", ",", "the", "night", ",", "when",
"the", "sky", "with", "all", "its", "lights", "fell", "upon",
"me", ",", "the", "same", "i", "had", "so", "often", "gazed",
"resolved", "to", "speak", "to", "him", ".", "so", "i", "marshalled",
"the", "words", "and", "opened", "my", "mouth", ",", "thinking",
"i", "would", "hear", "them", ".", "but", "all", "i", "heard",
"was", "a", "kind", "of", "rattle", ",", "unintelligible", "even",
"have", "a", "penny", "in", "my", "pocket", ",", "nor", "anything",
"resembling", "it", "."), middleFr_End_1946 = c("the", "end",
"they", "clothed", "me", "and", "gave", "me", "money", ".", "i",
"back", "mine", ".", "i", "added", ",", "give", "me", "back",
"my", "greatcoat", ".", "they", "replied", "that", "they", "had",
"burnt", "them", ",", "together", "with", "my", "other", "clothes",
".", "i", "understood", "then", "that", "the", "end", "was",
"near", ",", "at", "least", "fairly", "near", ".", "later", "on",
"i", "tried", "to", "exchange", "this", "hat", "for", "a", "cap",
",", "or", "a", "slouch", "which", "could", "be", "pulled", "down",
"over", "my", "face", ",", "but", "without", "much", "success",
".", "and", "yet", "i", "could", "not", "go", "about", "bare",
"-", "headed", ",", "with", "my", "skull", "in", "the", "state",
"it", "was", ".", "at", "first", "this", "hat", "was", "too",
"small", ",", "then", "it", "got", "used", "to", "me", ".", "they",
"gave", "me", "a", "tie", ",", "after", "long", "discussion",
".", "it", "seemed", "a", "pretty", "tie", "to", "me", ",", "but",
"i", "didn’t", "like", "it", ".", "when", "it", "came", "at",
"last", "i", "was", "too", "tired", "to", "send", "it", "back",
".", "but", "in", "the", "end", "it", "came", "in", "useful",
".", "it", "was", "blue", ",", "with", "kinds", "of", "little",
"stars", ".", "i", "didn’t", "feel", "well", ",", "but", "they",
"told", "me", "i", "was", "well", "enough", "."), middleFr_Expelled_1946 = c("the",
"expelled", "there", "were", "not", "many", "steps", ".", "i",
"had", "counted", "them", "a", "thousand", "times", ",", "both",
"going", "up", "and", "coming", "down", ",", "but", "the", "figure",
"has", "gone", "from", "my", "mind", ".", "i", "have", "never",
"known", "whether", "you", "should", "say", "one", "with", "your",
"every", "day", "several", "times", "a", "day", ",", "until",
"they", "sink", "forever", "in", "the", "mud", ".", "that’s",
"an", "order", ".")), class = "stylo.corpus", call = load.corpus.and.parse(files = "all",
corpus.dir = "x", markup.type = "plain", corpus.lang = "English.all",
splitting.rule = ("[ \t\n]+"), sampling = "no.sampling",
features = "w", ngram.size = 1, preserve.case = FALSE, encoding = "UTF-8"))
freq.list <- c("", "-", "—", ",", ";", ":", "!", "?", ".", "’", "\"",
"(", ")", "a", "about", "above", "across", "after", "again",
"against", "ah", "all", "almost", "along", "Already", "also",
"always", "am", "among", "an", "and", "another", "any", "anything",
"are", "as", "at", "away", "back", "be", "because", "been", "before",
"behind", "being", "best", "better", "between", "beyond", "both",
"but", "by", "came", "can", "can't", "can’t", "cannot", "come",
"comes", "could", "did", "didn’t", "different", "do", "does",
"doing", "don't", "don’t", "done", "down", "each", "either",
"else", "even", "ever", "every", "everything", "except", "far",
"few", "fifteen", "first", "five", "for", "forward", "four",
"from", "get", "go", "goes", "going", "got", "great", "had",
"half", "has", "have", "having", "he", "her", "here", "herself",
"him", "himself", "his", "how", "however", "hundred", "i", "i'll",
"i'm", "i’ll", "if", "in", "indeed", "instead", "into", "is",
"it", "it's", "it’s", "its", "itself", "just", "last", "late",
"least", "left", "less", "let", "like", "little", "long", "made",
"make", "many", "may", "me", "merely", "might", "mine", "more",
"most", "moved", "much", "must", "my", "myself", "near", "neither",
"never", "next", "no", "none", "nor", "not", "nothing", "now",
"of", "off", "often", "oh", "on", "once", "one", "only", "or",
"other", "others", "otherwise", "our", "out", "over", "own",
"perhaps", "place", "quite", "rather", "really", "right", "said",
"same", "say", "second", "shall", "she", "should", "since", "six",
"small", "so", "some", "someone", "something", "sometimes", "somewhere",
"soon", "still", "such", "ten", "than", "that", "that's", "that’s",
"the", "their", "them", "themselves", "then", "there", "therefore",
"these", "they", "thing", "things", "third", "this", "those",
"though", "three", "through", "thus", "till", "time", "times",
"to", "together", "too", "towards", "two", "under", "unless",
"until", "up", "upon", "us", "very", "was", "way", "we", "well",
"went", "were", "what", "whatever", "when", "where", "whether",
"which", "while", "who", "whom", "whose", "why", "will", "with",
"within", "without", "won't", "would", "yes", "yet", "you", "your",
"yourself")
You can use drop = FALSE. By default, indexing a matrix down to a single row simplifies the result to a plain vector, and a vector has no dim attribute, which is why dim(b) returns NULL. drop = FALSE keeps the one-row matrix:

b <- word.frequencies[18, 1:263, drop = FALSE]
dim(b)
# [1] 1 263
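This is base-R subsetting behaviour rather than anything stylo-specific, so the same fix applies to any matrix; a quick illustration with a throwaway matrix m:

m <- matrix(1:12, nrow = 3)    # a 3 x 4 matrix
dim(m[1, ])                    # NULL: a single row drops to a plain vector
dim(m[1, , drop = FALSE])      # [1] 1 4: stays a one-row matrix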