{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d98f445-a87d-4ad3-88ce-71d08230a4c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DEMO: Text Analytics with API call, key not required\n",
    "# Developed by Dr. Benyawarath \"Yaa\" Nithithanatchinnapat\n",
    "# Reference: Data Science from Scratch: First Principles with Python 2nd Edition by Joel Grus\n",
    "\n",
    "# Cell 1: Install the third-party packages this notebook imports.\n",
    "# We use %pip (not !pip) because %pip installs into the environment of the\n",
    "# running kernel, so the imports in the following cells resolve even when the\n",
    "# `pip` found on PATH belongs to a different Python installation.\n",
    "%pip install --quiet requests pandas matplotlib beautifulsoup4 vaderSentiment wordcloud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3368dd7a-bf8a-482c-8ed3-58e4e2e38274",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup # <-- Our new tool\n",
    "\n",
    "# This website is a sandbox *for* scraping. It will not block us.\n",
    "url = \"http://quotes.toscrape.com/\"\n",
    "\n",
    "# Give up if the server has not answered within this many seconds.\n",
    "# Without a timeout, requests.get() can wait forever and the cell never finishes.\n",
    "REQUEST_TIMEOUT_SECONDS = 10\n",
    "\n",
    "# We don't even need special headers\n",
    "response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "\n",
    "# Check our status code... just in case!\n",
    "# The cells below parse response.text, so they only make sense after a 200.\n",
    "if response.status_code == 200:\n",
    "    print(\"Success! Got the webpage HTML.\")\n",
    "    # print(response.text) # <-- Uncomment this to show the raw, messy HTML\n",
    "else:\n",
    "    print(f\"Failed to get website. Status Code: {response.status_code}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9243eb84-92dc-47f5-b61b-160cf9a10263",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the HTML string into a \"soup\" object\n",
    "soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "# .find_all() is the workhorse. It finds every tag that matches our pattern.\n",
    "quote_html_blocks = soup.find_all('div', class_='quote')\n",
    "\n",
    "print(f\"Found {len(quote_html_blocks)} quotes on the page.\")\n",
    "\n",
    "# Build one record per quote block: the author's name comes from the\n",
    "# <small class=\"author\"> tag and the quote itself from the <span class=\"text\"> tag.\n",
    "quote_list = [\n",
    "    {\n",
    "        'author': block.find('small', class_='author').text,\n",
    "        'text': block.find('span', class_='text').text,\n",
    "    }\n",
    "    for block in quote_html_blocks\n",
    "]\n",
    "\n",
    "# Convert our list of dictionaries into a clean DataFrame.\n",
    "# Naming the columns explicitly keeps 'author' and 'text' present even when\n",
    "# no quotes were found, so later cells that read df['text'] do not hit a KeyError.\n",
    "df = pd.DataFrame(quote_list, columns=['author', 'text'])\n",
    "\n",
    "print(\"\\n--- Clean, Scraped DataFrame ---\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ada5cb75-0179-4563-af98-76e57ca84234",
   "metadata": {},
   "outputs": [],
   "source": [
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
    "\n",
    "analyzer = SentimentIntensityAnalyzer()\n",
    "\n",
    "# Use .apply() to score every quote in our 'text' column.\n",
    "# For each quote we keep only the 'compound' entry of the analyzer's result.\n",
    "df['sentiment_score'] = df['text'].apply(\n",
    "    lambda txt: analyzer.polarity_scores(txt)['compound']\n",
    ")\n",
    "\n",
    "# Show the text alongside its new score, sorted by most positive.\n",
    "# The sorted frame is left as the cell's last expression (instead of being\n",
    "# passed to print) so Jupyter renders it as a table, like df.head() above.\n",
    "print(\"\\n--- Quotes Sorted by Positive Sentiment ---\")\n",
    "df.sort_values(by='sentiment_score', ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0855310d-2062-41ef-abca-10c5d1a7c43b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "# --- Insight 1: The Overall \"Mood\" ---\n",
    "overall_sentiment = df['sentiment_score'].mean()\n",
    "\n",
    "# Derive the label from the score instead of hard-coding it, so the summary\n",
    "# cannot claim \"POSITIVE\" when the average is actually neutral or negative.\n",
    "# +/-0.05 are the cut-offs conventionally used for VADER compound scores.\n",
    "if overall_sentiment >= 0.05:\n",
    "    mood_label = \"POSITIVE\"\n",
    "elif overall_sentiment <= -0.05:\n",
    "    mood_label = \"NEGATIVE\"\n",
    "else:\n",
    "    mood_label = \"NEUTRAL\"\n",
    "\n",
    "print(\"\\n--- Executive Summary ---\")\n",
    "print(f\"Overall Quote Sentiment Score: {overall_sentiment:.4f} (Generally {mood_label})\")\n",
    "\n",
    "# --- Insight 2: What Are They Talking About? ---\n",
    "print(\"\\nMost Common Themes in Quotes:\")\n",
    "# Join every quote into one string; the word cloud is generated from that text.\n",
    "all_text = \" \".join(df['text'])\n",
    "wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)\n",
    "\n",
    "# Draw the cloud as an image and hide the axes, which carry no meaning here.\n",
    "plt.figure(figsize=(10, 5))\n",
    "plt.imshow(wordcloud, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}