news · Commits

Commit 033fb2ae
Authored 4 years ago by Sartika Aritonang
Parent: 97bf6435

    Upload proximity.py

Showing 1 changed file with 287 additions and 0 deletions.

project/news_site/proximity/proximity.py (new file, mode 100644, +287 −0)
import re
import math
import string
import xml.dom.minidom as minidom
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
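# Dependency notes: Sastrawi is an Indonesian-language stemming library, and
# nltk's word_tokenize requires the 'punkt' tokenizer data to be available
# (install once with nltk.download('punkt')).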
def parse_xml():
    # Parse the news corpus and collect each field into a parallel list.
    news_collection = minidom.parse("data/news.xml")
    news_id = news_collection.getElementsByTagName('ID')
    news_source = news_collection.getElementsByTagName('SOURCE')
    news_link = news_collection.getElementsByTagName('LINK')
    news_title = news_collection.getElementsByTagName('TITLE')
    news_author = news_collection.getElementsByTagName('AUTHOR')
    news_datetime = news_collection.getElementsByTagName('DATETIME')
    news_paragraph = news_collection.getElementsByTagName('PARAGRAPH')
    N_news = len(news_id)
    id_in_news = []
    sentence_in_source = []
    sentence_in_link = []
    sentence_in_title = []
    sentence_in_author = []
    sentence_in_datetime = []
    sentence_in_news = []
    for i in range(N_news):
        ids = news_id[i].firstChild.data
        id_in_news.append(ids)
    for i in range(N_news):
        sentences = news_source[i].firstChild.data
        sentence_in_source.append(sentences)
    for i in range(N_news):
        sentences = news_link[i].firstChild.data
        sentence_in_link.append(sentences)
    for i in range(N_news):
        sentences = news_title[i].firstChild.data
        sentence_in_title.append(sentences)
    for i in range(N_news):
        sentences = news_author[i].firstChild.data
        sentence_in_author.append(sentences)
    for i in range(N_news):
        sentences = news_datetime[i].firstChild.data
        sentence_in_datetime.append(sentences)
    for i in range(N_news):
        sentences = news_paragraph[i].firstChild.data
        sentence_in_news.append(sentences)
    return ({'id_in_news': id_in_news,
             'sentence_in_source': sentence_in_source,
             'sentence_in_link': sentence_in_link,
             'sentence_in_title': sentence_in_title,
             'sentence_in_author': sentence_in_author,
             'sentence_in_datetime': sentence_in_datetime,
             'sentence_in_news': sentence_in_news})
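# The parser above assumes one <ID>, <SOURCE>, <LINK>, <TITLE>, <AUTHOR>,
# <DATETIME> and <PARAGRAPH> element per article, in document order. A
# hypothetical data/news.xml layout consistent with those tag names (the
# actual file is not in this diff):
#
#   <NEWS>
#     <DOC>
#       <ID>1</ID>
#       <SOURCE>...</SOURCE>
#       <LINK>...</LINK>
#       <TITLE>...</TITLE>
#       <AUTHOR>...</AUTHOR>
#       <DATETIME>...</DATETIME>
#       <PARAGRAPH>...</PARAGRAPH>
#     </DOC>
#   </NEWS>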
def removePunctuation(textList):
    # Replace ASCII punctuation with spaces, then strip URLs and curly quotes.
    for i in range(len(textList)):
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
        textList[i] = re.sub(r'“', '', textList[i])
        textList[i] = re.sub(r'”', '', textList[i])
    return textList
def token(sentence):
    # Tokenize one sentence with scikit-learn's default tokenizer.
    token = []
    for word in CountVectorizer().build_tokenizer()(sentence):
        token.append(word)
    return token

def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens

def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text
def get_token(file):
    #file = parse_xml()
    content = removePunctuation(file['sentence_in_news'])
    title = removePunctuation(file['sentence_in_title'])
    contents = caseFolding(content)
    titles = caseFolding(title)
    token_contents = tokenize(contents)
    token_titles = tokenize(titles)
    # Titles first, then article bodies: the result holds 2 * N_news entries.
    token = []
    for i in token_titles:
        token.append(i)
    for j in token_contents:
        token.append(j)
    return token
def checkStopword(sentence, stop_words):
    sentence = [w for w in sentence if w not in stop_words]
    return sentence

def stopwordRemove(textList):
    # Filter tokens against an Indonesian stopword list.
    with open("data/id.stopwords.02.01.2016.txt", "r") as fd:
        stopwords = fd.read().splitlines()
    stop_words = set(stopwords)
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text
def numberRemove(textList):
    # Drop any token that contains a digit.
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text
def stemming(textList):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # Note: text aliases textList, so the input lists are stemmed in place.
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text
def getAllTerms(textList):
    # Collect the sorted vocabulary across all documents.
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))
def createIndex(file, textList):
    # file comes from parse_xml(); textList from the preprocessing pipeline:
    #token = get_token(file)
    #tokenize = stopwordRemove(token)
    #tokenize = numberRemove(tokenize)
    #textList = stemming(tokenize)
    terms = getAllTerms(textList)
    proximity = {}
    # textList holds titles then bodies (2 * N_news entries), so the id list
    # is doubled to map every entry back to its article id. The original code
    # referenced a free variable `file` here; it is now an explicit parameter.
    doc_ids = file['id_in_news'] * 2
    for term in terms:
        position = {}
        for n in range(len(textList)):
            if (term in textList[n]):
                position[doc_ids[n]] = []
                for i in range(len(textList[n])):
                    if (term == textList[n][i]):
                        position[doc_ids[n]].append(i)
        proximity[term] = position
    return proximity
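# The resulting positional index maps each stemmed term to the documents that
# contain it and to the token offsets within each document, e.g. (illustrative
# values only):
#
#   {'berita': {'1': [0, 17], '4': [3]}, 'menteri': {'2': [5]}}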
def save_indexing():
    # Build the index from scratch (the pipeline the original left commented
    # out inside createIndex) and persist it as the repr of a dict.
    file = parse_xml()
    token = get_token(file)
    tokenize = stopwordRemove(token)
    tokenize = numberRemove(tokenize)
    textList = stemming(tokenize)
    indexing = createIndex(file, textList)
    with open('index.txt', 'w') as out:
        out.write(str(indexing))
# save_indexing()
def open_indexing():
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    # The index file is the repr of a dict; eval() restores it.
    # (ast.literal_eval would be the safer choice for untrusted input.)
    index = eval(fi)
    return index
def removePunctuationQuery(textList):
    punctuations = '''!()-[]{};:'",<>./?@#$%^&*_~'''
    for x in textList:
        if x in punctuations:
            textList = textList.replace(x, "")
    return textList
def queryPreprocessing(query):
    terms = []
    query = removePunctuationQuery(query)
    querys = []
    querys.append(query)
    #querys = caseFolding(querys)
    for i in range(len(querys)):
        # Strip digits (the comprehension variable is named ch so it no
        # longer shadows the loop index i, as it did in the original).
        querys[i] = ''.join([ch for ch in querys[i] if not ch.isdigit()])
        querys[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', querys[i], flags=re.MULTILINE)
        terms.append(word_tokenize(querys[i]))
    terms = numberRemove(terms)
    terms = stopwordRemove(terms)
    terms = stemming(terms)
    return terms
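# Example (hypothetical query; exact stems depend on Sastrawi's dictionary):
# queryPreprocessing(['Pemberitaan ekonomi 2020']) strips the digits,
# tokenizes, drops Indonesian stopwords, and stems the remaining tokens,
# returning something like [['berita', 'ekonomi']].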
def queryInIndex(query, index):
    # Keep only the query terms that actually occur in the index.
    result = []
    for word in query:
        if word in index:
            result.append(word)
    return result
def df(query, index):
    # Document frequency: how many documents each query term appears in.
    docFreq = {}
    for word in query:
        if word in index:
            docFreq[word] = len(index[word])
    return docFreq
def idf(df, N):
    # Inverse document frequency: log10(N / df_t).
    inv = {}
    for word in df:
        inv[word] = math.log10(N / df[word])
    return inv
def tf(query, index):
    # Term frequency per document, read off the positional index.
    termFreq = {}
    for word in query:
        freq = {}
        if word in index:
            for i in index[word]:
                freq[i] = len(index[word][i])
        termFreq[word] = freq
    return termFreq
def tfidf(tf, idf):
    # Log-weighted tf-idf: w(t, d) = (1 + log10(tf_{t,d})) * idf(t).
    w = {}
    for word in tf:
        wtd = {}
        for doc in tf[word]:
            wtd[doc] = (1 + (math.log10(tf[word][doc]))) * idf[word]
        w[word] = wtd
    return w
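# Worked example (illustrative numbers): with N = 100 documents and a term
# occurring in 10 of them, idf = log10(100 / 10) = 1. If that term appears
# 3 times in a document, its weight there is (1 + log10(3)) * 1 ≈ 1.477.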
def score(TFIDF):
    # Sum per-term weights for each document, then rank documents by score.
    res = {}
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = 0
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = res[j] + TFIDF[i][j]
    sorted_dict = sorted(res, key=res.get, reverse=True)
    return ({'sorted_dict': sorted_dict, 'res': res})
def results(query):
    querys = []
    querys.append(query)
    file = parse_xml()
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    index = eval(fi)
    terms = queryPreprocessing(querys)
    querys = terms[0]
    querys = queryInIndex(querys, index)
    N = len(file['id_in_news'])
    tfidf_list = []
    docFrequency = df(querys, index)
    invDocFrequency = idf(docFrequency, N)
    termFrequency = tf(querys, index)
    TFIDF = tfidf(termFrequency, invDocFrequency)
    sc = score(TFIDF)
    relevanceDocNumber = []
    count = 0
    result = []
    process = []
    for i in range(len(sc['sorted_dict'])):
        relevanceDocNumber.append(int(sc['sorted_dict'][i]))
        # Map the ranked document id back to its row in the parsed corpus.
        a = file['id_in_news'].index(sc['sorted_dict'][i])
        rank = i + 1
        doc_score = sc['res'][sc['sorted_dict'][i]]
        doc_id = sc['sorted_dict'][i]
        doc_source = file['sentence_in_source'][a][:]
        doc_link = file['sentence_in_link'][a][:]
        doc_title = file['sentence_in_title'][a][:]
        doc_author = file['sentence_in_author'][a][:]
        doc_datetime = file['sentence_in_datetime'][a][:]
        # Show only the first 400 characters of the article body as a snippet.
        doc_contents = file['sentence_in_news'][a][0:400] + '..........'
        result.append({'doc_score': doc_score,
                       'doc_id': doc_id,
                       'doc_source': doc_source,
                       'doc_link': doc_link,
                       'doc_title': doc_title,
                       'doc_author': doc_author,
                       'doc_datetime': doc_datetime,
                       'doc_contents': doc_contents})
    process.append({'terms': terms,
                    'TFIDF': TFIDF,
                    'docFrequency': docFrequency,
                    'invDocFrequency': invDocFrequency,
                    'termFrequency': termFrequency})
    return ({'result': result, 'process': process})
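# Usage sketch (hypothetical query; assumes data/news.xml and the stopword
# list exist, and that save_indexing() has been run once with its output
# moved to data/index.txt):
if __name__ == "__main__":
    hits = results("berita ekonomi")  # example Indonesian query
    for doc in hits['result']:
        print(doc['doc_score'], doc['doc_id'], doc['doc_title'])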