{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d98f445-a87d-4ad3-88ce-71d08230a4c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DEMO: Text Analytics with API call, key not required\n",
    "# Developed by Dr. Benyawarath \"Yaa\" Nithithanatchinnapat\n",
    "# Reference: Data Science from Scratch: First Principles with Python 2nd Edition by Joel Grus\n",
    "\n",
    "# Cell 1: Make sure we have the new tools.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install --quiet beautifulsoup4 vaderSentiment wordcloud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3368dd7a-bf8a-482c-8ed3-58e4e2e38274",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup  # <-- Our new tool\n",
    "\n",
    "# This website is a sandbox *for* scraping. It will not block us.\n",
    "url = \"http://quotes.toscrape.com/\"\n",
    "\n",
    "# We don't even need special headers, but we do pass a timeout:\n",
    "# without one, requests can wait forever on an unresponsive server.\n",
    "response = requests.get(url, timeout=10)\n",
    "\n",
    "# Check our status code... just in case!\n",
    "if response.status_code == 200:\n",
    "    print(\"Success! Got the webpage HTML.\")\n",
    "    # print(response.text)  # <-- Uncomment this to show the raw, messy HTML\n",
    "else:\n",
    "    # Stop here: the cells below would otherwise try to parse an error page.\n",
    "    raise RuntimeError(f\"Failed to get website. Status Code: {response.status_code}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9243eb84-92dc-47f5-b61b-160cf9a10263",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the HTML string into a \"soup\" object\n",
    "soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "# .find_all() is the workhorse. It finds every tag that matches our pattern.\n",
    "quote_html_blocks = soup.find_all('div', class_='quote')\n",
    "\n",
    "print(f\"Found {len(quote_html_blocks)} quotes on the page.\")\n",
    "\n",
    "# With zero matches the DataFrame below would have no 'text' column and the\n",
    "# later cells would fail with a confusing KeyError, so fail clearly right here.\n",
    "if not quote_html_blocks:\n",
    "    raise ValueError(\"No quotes found on the page; the HTML structure may have changed.\")\n",
    "\n",
    "# Now we loop through the blocks we found and pull out the specific text\n",
    "quote_list = []\n",
    "for block in quote_html_blocks:\n",
    "    text = block.find('span', class_='text').text\n",
    "    author = block.find('small', class_='author').text\n",
    "\n",
    "    quote_list.append({\n",
    "        'author': author,\n",
    "        'text': text\n",
    "    })\n",
    "\n",
    "# Convert our list of dictionaries into a clean DataFrame\n",
    "df = pd.DataFrame(quote_list)\n",
    "\n",
    "print(\"\\n--- Clean, Scraped DataFrame ---\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ada5cb75-0179-4563-af98-76e57ca84234",
   "metadata": {},
   "outputs": [],
   "source": [
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
    "\n",
    "analyzer = SentimentIntensityAnalyzer()\n",
    "\n",
    "# Use .apply() to score every quote in our 'text' column.\n",
    "# We keep only the 'compound' entry of the scores VADER returns for each text.\n",
    "df['sentiment_score'] = df['text'].apply(\n",
    "    lambda txt: analyzer.polarity_scores(txt)['compound']\n",
    ")\n",
    "\n",
    "# Show the text alongside its new score, sorted by most positive.\n",
    "# Ending the cell with the expression (instead of print) renders a rich table.\n",
    "print(\"\\n--- Quotes Sorted by Positive Sentiment ---\")\n",
    "df.sort_values(by='sentiment_score', ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0855310d-2062-41ef-abca-10c5d1a7c43b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "# --- Insight 1: The Overall \"Mood\" ---\n",
    "overall_sentiment = df['sentiment_score'].mean()\n",
    "\n",
    "# Derive the label from the actual score instead of assuming it is positive.\n",
    "# Thresholds follow the usual VADER convention for the compound score:\n",
    "# >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.\n",
    "if overall_sentiment >= 0.05:\n",
    "    mood = \"POSITIVE\"\n",
    "elif overall_sentiment <= -0.05:\n",
    "    mood = \"NEGATIVE\"\n",
    "else:\n",
    "    mood = \"NEUTRAL\"\n",
    "\n",
    "print(\"\\n--- Executive Summary ---\")\n",
    "print(f\"Overall Quote Sentiment Score: {overall_sentiment:.4f} (Generally {mood})\")\n",
    "\n",
    "# --- Insight 2: What Are They Talking About? ---\n",
    "print(\"\\nMost Common Themes in Quotes:\")\n",
    "all_text = \" \".join(df['text'])\n",
    "wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)\n",
    "\n",
    "plt.figure(figsize=(10, 5))\n",
    "plt.imshow(wordcloud, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}