Febby B. Simanjuntak / STBI / Commits

Commit f27cbbb1, authored 4 years ago by Febby Simanjuntak
inverted done
parent 3e9ccf70
Showing 2 changed files with 84 additions and 56 deletions:

.ipynb_checkpoints/STBI_Project-checkpoint.ipynb  +42 -28
STBI_Project.ipynb  +42 -28
.ipynb_checkpoints/STBI_Project-checkpoint.ipynb @ f27cbbb1
@@ -632,64 +632,78 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def tokenize(row):\n",
-    "    if row is None or row is '':\n",
-    "        tokens = \"\"\n",
-    "    else:\n",
-    "        tokens = str(row).split(\" \")[:maxtokens]\n",
-    "    return tokens"
+    "def tokenize(text):\n",
+    "    words = word_tokenize(text)\n",
+    "    return words"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Regular expressions to remove unnecessary characters"
+    "### Normalization"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import re\n",
+    "def to_lowercase(data):\n",
+    "    new_word = []\n",
+    "    for word in data.columns:\n",
+    "        word = word.lower()\n",
+    "        new_word.append(word)\n",
+    "    return new_word\n",
+    "\n",
+    "def remove_stopwords(data):\n",
+    "    for col in data.columns:\n",
+    "        if col in stopwords.words('english'):\n",
+    "            data = data.drop(columns = col)\n",
+    "    return data;\n",
+    "\n",
-    "def reg_expressions(row):\n",
-    "    tokens = []\n",
-    "    try:\n",
-    "        for token in row:\n",
-    "            token = token.lower() # make all characters lower case\n",
-    "            token = re.sub(r'[\\W\\d]', \"\", token)\n",
-    "            token = token[:maxtokenlen] # truncate token\n",
-    "            tokens.append(token)\n",
-    "    except:\n",
-    "        token = \"\"\n",
-    "        tokens.append(token)\n",
-    "    return tokens"
+    "def normalize():\n",
+    "    words = to_lowercase(df)\n",
+    "    data = remove_stopwords(df)\n",
+    "    return data"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "norm = normalize()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Stop-word removal"
+    "### Inverted Index"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def stop_word_removal(row):\n",
-    "    token = [token for token in row if token not in stopwords]\n",
-    "    token = filter(None, token)\n",
-    "    return token"
+    "\n",
+    "def create_Inverted_index(all_unique_documents):\n",
+    "    inverted_index = {}\n",
+    "    for doc_id in range(len(all_unique_documents)):\n",
+    "        for term in all_unique_documents[doc_id]:\n",
+    "            if term not in inverted_index:\n",
+    "                inverted_index[term] = []\n",
+    "            inverted_index[term].append(doc_id)\n",
+    "    return inverted_index"
    ]
   },
   {
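Taken together, this hunk replaces the earlier split-based tokenize(), the reg_expressions() cleanup, and stop_word_removal() with NLTK tokenization, a normalize() pass, and the create_Inverted_index() builder that the commit message ("inverted done") refers to. As committed, normalize() takes no arguments, reads a global DataFrame df defined outside this hunk, discards the result of to_lowercase(), and remove_stopwords() drops DataFrame columns whose names are stopwords rather than filtering tokens. Below is a minimal runnable sketch of what the pipeline appears to intend, with both steps applied to the token stream instead; sample_docs is a hypothetical stand-in for the notebook's df, which is not shown in this diff. It assumes NLTK with the punkt and stopwords data installed.

# A hedged sketch of the committed pipeline, not the notebook's exact code.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt", quiet=True)      # newer NLTK releases may also need "punkt_tab"
nltk.download("stopwords", quiet=True)

def tokenize(text):
    # Same as the committed cell: NLTK word tokenization.
    return word_tokenize(text)

def normalize(tokens):
    # Lowercase and drop English stopwords, applied to the token stream
    # rather than to DataFrame columns as the committed cell does.
    stop = set(stopwords.words("english"))
    return [t.lower() for t in tokens if t.lower() not in stop]

def create_inverted_index(documents):
    # Same structure as the committed create_Inverted_index(): map each
    # term to the list of doc ids in which it occurs (duplicate ids are
    # kept, as in the committed version).
    inverted_index = {}
    for doc_id, terms in enumerate(documents):
        for term in terms:
            inverted_index.setdefault(term, []).append(doc_id)
    return inverted_index

sample_docs = [                          # hypothetical stand-in for df
    "Information retrieval builds an inverted index.",
    "An index maps each term to its documents.",
]
index = create_inverted_index([normalize(tokenize(d)) for d in sample_docs])
print(index["index"])                    # -> [0, 1]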
STBI_Project.ipynb @ f27cbbb1
(The hunk in this file, @@ -632,64 +632,78 @@, is identical to the one shown above for .ipynb_checkpoints/STBI_Project-checkpoint.ipynb.)
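For context, a hypothetical usage sketch that is not part of this commit: once create_Inverted_index() has run, a conjunctive (AND) query reduces to intersecting posting lists of the shape it produces (term mapped to a list of doc ids).

# Hypothetical: answer an AND query against an inverted index like the
# one built above; demo_index is a made-up example, not notebook data.
def and_query(inverted_index, terms):
    postings = [set(inverted_index.get(t, [])) for t in terms]
    return set.intersection(*postings) if postings else set()

demo_index = {"inverted": [0], "index": [0, 1], "term": [1]}
print(and_query(demo_index, ["inverted", "index"]))  # -> {0}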