{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#import library\n", "import urllib\n", "import requests\n", "import bs4\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# request a link\n", "def request_url(link):\n", " response = requests.get(link)\n", " html = response.text\n", " return html" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Function to parse html\n", "def parse_html(to_parse):\n", " soup = bs4.BeautifulSoup(to_parse, 'html.parser')\n", " return soup" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#take sub menu news form liputan6.com\n", "def all_section(main_url):\n", " section_list = []\n", " for i in main_url:\n", " soup = parse_html(request_url(i))\n", " for a in soup.find_all('a', href=True):\n", " if a.text:\n", " section_list.append(a['href'])\n", " return section_list" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#get only link about corona or covid\n", "def find_corona(main_url):\n", " url_list = []\n", " not_news=[]\n", " for i in main_url:\n", " if i.find('corona')!=-1 or i.find('covid')!=-1 or i.find('pandemi')!=-1:\n", " url_list.append(i)\n", " url_list = list(dict.fromkeys(url_list))\n", " for i in url_list:\n", " if (i.find('read') == -1):\n", " not_news.append(i) \n", " for j in not_news:\n", " url_list.remove(j)\n", " return url_list" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def date(main_url):\n", " datetime = []\n", " for i in main_url:\n", " soup=parse_html(request_url(i))\n", " get_datetime = soup.find(\"div\", class_=\"namerep\")\n", " if(get_datetime):\n", " datetime.append(get_datetime.b.string)\n", " else:\n", " datetime.append('None')\n", " return datetime" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def writer(main_url):\n", " author = []\n", " for i in main_url:\n", " soup=parse_html(request_url(i))\n", " get_author = soup.find(\"div\", class_=\"namerep\")\n", " if(get_author):\n", " get_author = ' '.join(get_author.text.split())\n", " author.append(get_author.partition(\", \")[0])\n", " else:\n", " author.append('None')\n", " return author" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def title(main_url):\n", " titles = []\n", " for i in main_url:\n", " soup=parse_html(request_url(i))\n", " try:\n", " get_title = soup.find(\"div\", class_=\"title\")\n", " titles.append(get_title.h1.text)\n", " except:\n", " titles.append('None')\n", " return titles" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def collect_text(main_url, titles = [], author = [], datetime = []):\n", " paragraf = []\n", " data = []\n", " join =[]\n", " isiteks = []\n", " \n", " for i,j in enumerate(main_url):\n", " a = parse_html(request_url(j))\n", " content = a.find('div', class_='read')\n", " pragraf = content.find_all('p')\n", " for k in pragraf:\n", " s = ' '.join(k.text.split())\n", " paragraf.append(s)\n", " data.append(paragraf)\n", " paragraf = []\n", " for i in data:\n", " join.append(' '.join(i))\n", " \n", " for i, j in enumerate(main_url):\n", " isiteks.append({'news': 'Okezone.com', 'link' : j, 'title': titles[i], 'author' : author[i], 'date_time': datetime[i], 'paragraf' : join[i]})\n", " return isiteks" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def save_file(file, file_name = ''):\n", " new_file = pd.DataFrame(file, columns=['news','link','title', 'author', 'date_time', 'paragraf'])\n", " new_file.to_csv(file_name + '.csv', index=True, encoding='utf-8', sep = ',')\n", " \n", " return new_file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def get_news(main_url, file_name = ''):\n", " section = all_section(main_url)\n", " corona_news = find_corona(section)\n", " titles = title(corona_news)\n", " author = writer(corona_news)\n", " datetime = date(corona_news)\n", " text = collect_text(corona_news, titles, author, datetime)\n", " file = save_file(text, file_name)\n", " \n", " return file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "url=[\"https://www.okezone.com/\"]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>news</th>\n", " <th>link</th>\n", " <th>title</th>\n", " <th>author</th>\n", " <th>date_time</th>\n", " <th>paragraf</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Okezone.com</td>\n", " <td>https://lifestyle.okezone.com/read/2020/05/07/...</td>\n", " <td>BKKBN Khawatirkan Ledakan Penduduk Pasca-Pande...</td>\n", " <td>Muhammad Sukardi</td>\n", " <td>Kamis 07 Mei 2020 12:00 WIB</td>\n", " <td>PANDEMI virus corona COVID-19 memang membuat b...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Okezone.com</td>\n", " <td>https://lifestyle.okezone.com/read/2020/05/07/...</td>\n", " <td>Bermutasi, Virus Corona COVID-19 Makin Jinak?</td>\n", " <td>Leonardus Selwyn Kangsaputra</td>\n", " <td>Kamis 07 Mei 2020 11:45 WIB</td>\n", " <td>PENELITI di Arizona State University (ASU), ba...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Okezone.com</td>\n", " <td>https://sports.okezone.com/read/2020/05/07/40/...</td>\n", " <td>Cegah Virus Corona, Menpora Beri Wejangan kepa...</td>\n", " <td>Rivan Nasri Rachman</td>\n", " <td>Kamis 07 Mei 2020 11:15 WIB</td>\n", " <td>JAKARTA – PBSI baru saja merayakan hari jadi m...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Okezone.com</td>\n", " <td>https://www.okezone.com/tren/read/2020/05/07/6...</td>\n", " <td>Masjidil Haram Akan Dibuka, Corona di Mekkah M...</td>\n", " <td>Mohammad Saifulloh</td>\n", " <td>Kamis 07 Mei 2020 12:08 WIB</td>\n", " <td>RENCANA pembukaan dua Masjid Suci di Arab Saud...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Okezone.com</td>\n", " <td>https://lifestyle.okezone.com/read/2020/05/07/...</td>\n", " <td>Bermutasi, Virus Corona COVID-19 Makin Jinak?</td>\n", " <td>Leonardus Selwyn Kangsaputra</td>\n", " <td>Kamis 07 Mei 2020 11:45 WIB</td>\n", " <td>PENELITI di Arizona State University (ASU), ba...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Okezone.com</td>\n", " <td>https://news.okezone.com/read/2020/05/07/18/22...</td>\n", " <td>Wabah COVID-19, Umat Buddha di Berbagai Negara...</td>\n", " <td>Rahman Asmardika</td>\n", " <td>Kamis 07 Mei 2020 10:16 WIB</td>\n", " <td>UMAT Buddha di berbagai belahan dunia merayaka...</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>Okezone.com</td>\n", " <td>https://nasional.okezone.com/read/2020/05/07/3...</td>\n", " <td>Relaksasi Moda Transportasi, Pengamat: Asa Mem...</td>\n", " <td>Harits Tryan Akhmad</td>\n", " <td>Kamis 07 Mei 2020 07:03 WIB</td>\n", " <td>JAKARTA - Pengamat kebijakan publik, Trubus Ra...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>Okezone.com</td>\n", " <td>https://news.okezone.com/read/2020/05/07/18/22...</td>\n", " <td>Trump: Pandemi Virus Corona Adalah Serangan Te...</td>\n", " <td>Rahman Asmardika</td>\n", " <td>Kamis 07 Mei 2020 14:30 WIB</td>\n", " <td>WASHINGTON - Presiden Amerika Serikat (AS) Don...</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Okezone.com</td>\n", " <td>https://www.okezone.com/tren/read/2020/05/07/6...</td>\n", " <td>Seperti Umat Islam, Kelompok Ini Berzakat untu...</td>\n", " <td>Muhammad Sukardi</td>\n", " <td>Kamis 07 Mei 2020 14:21 WIB</td>\n", " <td>Sekelompok profesional dari komunitas Dalit te...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Okezone.com</td>\n", " <td>https://news.okezone.com/read/2020/05/07/512/2...</td>\n", " <td>Terapkan PSBB, Status Kota Tegal Kembali Hijau...</td>\n", " <td>Taufik Budi</td>\n", " <td>Kamis 07 Mei 2020 14:14 WIB</td>\n", " <td>TEGAL - Pembatasan Sosial Berskala Besar (PSBB...</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>Okezone.com</td>\n", " <td>https://megapolitan.okezone.com/read/2020/05/0...</td>\n", " <td>3 Penumpang Positif Covid-19, Walkot Bekasi: A...</td>\n", " <td>Wisnu Yusep</td>\n", " <td>Kamis 07 Mei 2020 14:10 WIB</td>\n", " <td>BEKASI - Wali Kota Bekasi Rahmat Effendi menga...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>Okezone.com</td>\n", " <td>https://economy.okezone.com/read/2020/05/07/32...</td>\n", " <td>Terpukul Covid-19, Qatar Airways Bakal PHK Peg...</td>\n", " <td>Kamis 07 Mei 2020 14:07 WIB</td>\n", " <td>Kamis 07 Mei 2020 14:07 WIB</td>\n", " <td>JAKARTA - Maskapai penerbangan internasional Q...</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>Okezone.com</td>\n", " <td>https://megapolitan.okezone.com/read/2020/05/0...</td>\n", " <td>Usaha Dekorasi Pernikahan Banting Setir Buat P...</td>\n", " <td>Putra Ramadhani Astyawan</td>\n", " <td>Kamis 07 Mei 2020 13:41 WIB</td>\n", " <td>BOGOR - Siapa sangka pembuatan peti jenazah kh...</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>Okezone.com</td>\n", " <td>https://news.okezone.com/read/2020/05/07/340/2...</td>\n", " <td>Positif Corona, 13 Warga Desa di Polewali Mand...</td>\n", " <td>Huzair Zainal</td>\n", " <td>Kamis 07 Mei 2020 13:41 WIB</td>\n", " <td>POLEWALI MANDAR – Sebanyak 13 orang yang beras...</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>Okezone.com</td>\n", " <td>https://economy.okezone.com/read/2020/05/07/32...</td>\n", " <td>Relaksasi Kebijakan Mudik Rawan Memperlama Pen...</td>\n", " <td>Giri Hartomo</td>\n", " <td>Kamis 07 Mei 2020 13:35 WIB</td>\n", " <td>JAKARTA - Pemerintah membuka kembali transport...</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>Okezone.com</td>\n", " <td>https://economy.okezone.com/read/2020/05/07/32...</td>\n", " <td>Moda Transportasi Kembali Dibuka, Pandemi Covi...</td>\n", " <td>Giri Hartomo</td>\n", " <td>Kamis 07 Mei 2020 13:31 WIB</td>\n", " <td>JAKARTA - Pemerintah memutuskan untuk membuka ...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " news link \\\n", "0 Okezone.com https://lifestyle.okezone.com/read/2020/05/07/... \n", "1 Okezone.com https://lifestyle.okezone.com/read/2020/05/07/... \n", "2 Okezone.com https://sports.okezone.com/read/2020/05/07/40/... \n", "3 Okezone.com https://www.okezone.com/tren/read/2020/05/07/6... \n", "4 Okezone.com https://lifestyle.okezone.com/read/2020/05/07/... \n", "5 Okezone.com https://news.okezone.com/read/2020/05/07/18/22... \n", "6 Okezone.com https://nasional.okezone.com/read/2020/05/07/3... \n", "7 Okezone.com https://news.okezone.com/read/2020/05/07/18/22... \n", "8 Okezone.com https://www.okezone.com/tren/read/2020/05/07/6... \n", "9 Okezone.com https://news.okezone.com/read/2020/05/07/512/2... \n", "10 Okezone.com https://megapolitan.okezone.com/read/2020/05/0... \n", "11 Okezone.com https://economy.okezone.com/read/2020/05/07/32... \n", "12 Okezone.com https://megapolitan.okezone.com/read/2020/05/0... \n", "13 Okezone.com https://news.okezone.com/read/2020/05/07/340/2... \n", "14 Okezone.com https://economy.okezone.com/read/2020/05/07/32... \n", "15 Okezone.com https://economy.okezone.com/read/2020/05/07/32... \n", "\n", " title \\\n", "0 BKKBN Khawatirkan Ledakan Penduduk Pasca-Pande... \n", "1 Bermutasi, Virus Corona COVID-19 Makin Jinak? \n", "2 Cegah Virus Corona, Menpora Beri Wejangan kepa... \n", "3 Masjidil Haram Akan Dibuka, Corona di Mekkah M... \n", "4 Bermutasi, Virus Corona COVID-19 Makin Jinak? \n", "5 Wabah COVID-19, Umat Buddha di Berbagai Negara... \n", "6 Relaksasi Moda Transportasi, Pengamat: Asa Mem... \n", "7 Trump: Pandemi Virus Corona Adalah Serangan Te... \n", "8 Seperti Umat Islam, Kelompok Ini Berzakat untu... \n", "9 Terapkan PSBB, Status Kota Tegal Kembali Hijau... \n", "10 3 Penumpang Positif Covid-19, Walkot Bekasi: A... \n", "11 Terpukul Covid-19, Qatar Airways Bakal PHK Peg... \n", "12 Usaha Dekorasi Pernikahan Banting Setir Buat P... \n", "13 Positif Corona, 13 Warga Desa di Polewali Mand... \n", "14 Relaksasi Kebijakan Mudik Rawan Memperlama Pen... \n", "15 Moda Transportasi Kembali Dibuka, Pandemi Covi... \n", "\n", " author date_time \\\n", "0 Muhammad Sukardi Kamis 07 Mei 2020 12:00 WIB \n", "1 Leonardus Selwyn Kangsaputra Kamis 07 Mei 2020 11:45 WIB \n", "2 Rivan Nasri Rachman Kamis 07 Mei 2020 11:15 WIB \n", "3 Mohammad Saifulloh Kamis 07 Mei 2020 12:08 WIB \n", "4 Leonardus Selwyn Kangsaputra Kamis 07 Mei 2020 11:45 WIB \n", "5 Rahman Asmardika Kamis 07 Mei 2020 10:16 WIB \n", "6 Harits Tryan Akhmad Kamis 07 Mei 2020 07:03 WIB \n", "7 Rahman Asmardika Kamis 07 Mei 2020 14:30 WIB \n", "8 Muhammad Sukardi Kamis 07 Mei 2020 14:21 WIB \n", "9 Taufik Budi Kamis 07 Mei 2020 14:14 WIB \n", "10 Wisnu Yusep Kamis 07 Mei 2020 14:10 WIB \n", "11 Kamis 07 Mei 2020 14:07 WIB Kamis 07 Mei 2020 14:07 WIB \n", "12 Putra Ramadhani Astyawan Kamis 07 Mei 2020 13:41 WIB \n", "13 Huzair Zainal Kamis 07 Mei 2020 13:41 WIB \n", "14 Giri Hartomo Kamis 07 Mei 2020 13:35 WIB \n", "15 Giri Hartomo Kamis 07 Mei 2020 13:31 WIB \n", "\n", " paragraf \n", "0 PANDEMI virus corona COVID-19 memang membuat b... \n", "1 PENELITI di Arizona State University (ASU), ba... \n", "2 JAKARTA – PBSI baru saja merayakan hari jadi m... \n", "3 RENCANA pembukaan dua Masjid Suci di Arab Saud... \n", "4 PENELITI di Arizona State University (ASU), ba... \n", "5 UMAT Buddha di berbagai belahan dunia merayaka... \n", "6 JAKARTA - Pengamat kebijakan publik, Trubus Ra... \n", "7 WASHINGTON - Presiden Amerika Serikat (AS) Don... \n", "8 Sekelompok profesional dari komunitas Dalit te... \n", "9 TEGAL - Pembatasan Sosial Berskala Besar (PSBB... \n", "10 BEKASI - Wali Kota Bekasi Rahmat Effendi menga... \n", "11 JAKARTA - Maskapai penerbangan internasional Q... \n", "12 BOGOR - Siapa sangka pembuatan peti jenazah kh... \n", "13 POLEWALI MANDAR – Sebanyak 13 orang yang beras... \n", "14 JAKARTA - Pemerintah membuka kembali transport... \n", "15 JAKARTA - Pemerintah memutuskan untuk membuka ... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "okezone_news=get_news(url, file_name = 'okezone_tiga')\n", "okezone_news" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }