Word-Node2vec TA12.ipynb 58.7 KB
In [1]:
import pandas as pd
import numpy as np

# Load the product catalogue; the first CSV column becomes the row index.
data = pd.read_csv("men-products.csv", sep=",", index_col=0)

Ignore the tuple

In [2]:
# Per-column count and percentage of missing values.
# The percentage is taken over the actual number of rows (len(data)) rather
# than a hard-coded 20000, so it stays correct if the CSV changes size.
missing_counts = data.isnull().sum()
missing_data = pd.DataFrame({
    'total_missing': missing_counts,
    'percent_missing': (missing_counts / len(data)) * 100,
})
missing_data
total_missing percent_missing
NAME 1 0.005
CATEGORY 0 0.000
DESCRIPTION & COLOR 0 0.000
FABRIC 4833 24.165
IMAGE 0 0.000
SIZE 3838 19.190
PRICE 0 0.000
PRODUCT ID 0 0.000
WEBSITE 0 0.000
PRODUCT URL 0 0.000
In [3]:
data.head(10)
NAME CATEGORY DESCRIPTION & COLOR FABRIC IMAGE SIZE PRICE PRODUCT ID WEBSITE PRODUCT URL
SERIAL NO
1 U.S. Polo Assn. Men Brown Genuine Leather Two ... accessories U.S. Polo Assn. Men Brown Genuine Leather Two ... Genuine leather https://assets.myntassets.com/h_1440,q_100,w_1... Height: 11.5 cm 809 1943420 Myntra https://www.myntra.com/wallets/us-polo-assn/us...
2 Baggit Men Black Solid Two Fold Wallet accessories Baggit Men Black Solid Two Fold Wallet, Baggi... PU https://assets.myntassets.com/h_1440,q_100,w_1... Height: 720 4608404 Myntra https://www.myntra.com/wallets/baggit/baggit-m...
3 HRX by Hrithik Roshan Men Grey Solid Baseball Cap accessories HRX By Hrithik Roshan Men Grey Solid Baseball ... NaN https://assets.myntassets.com/h_1440,q_100,w_1... NaN 279 2178513 Myntra https://www.myntra.com/caps/hrx-by-hrithik-ros...
4 Puma Unisex Grey Style Military Solid Baseball... accessories Puma Unisex Grey Style Military Solid Baseball... NaN https://assets.myntassets.com/h_1440,q_100,w_1... NaN 499 6699035 Myntra https://www.myntra.com/caps/puma/puma-unisex-g...
5 FabSeasons Beige Solid Scarf accessories FabSeasons Beige Solid Scarf, FabSeasons, Scar... Acrylic https://assets.myntassets.com/h_1440,q_100,w_1... Length:0.9 m 449 2439658 Myntra https://www.myntra.com/scarves/fabseasons/fabs...
6 Ed Hardy Men Black Embellished Belt accessories Ed Hardy Men Black Embellished Belt, Ed Hardy... Leather https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.7 cm 1199 2238752 Myntra https://www.myntra.com/belts/ed-hardy/ed-hardy...
7 Roadster Men Tan Brown Leather Belt accessories Roadster Men Tan Brown Leather Belt, Roadster,... Leather https://assets.myntassets.com/h_1440,q_100,w_1... Width: 4 cm 419 2975974 Myntra https://www.myntra.com/belts/roadster/roadster...
8 Peora Silver-Toned Rhodium-Plated Stone-Studde... accessories Peora Silver Toned Rhodium Plated Stone Studde... NaN https://assets.myntassets.com/h_1440,q_100,w_1... NaN 551 3006095 Myntra https://www.myntra.com/ring/peora/peora-silver...
9 Royal Enfield Unisex White Urban Trooper Helme... accessories Royal Enfield Unisex White Urban Trooper Helme... NaN https://assets.myntassets.com/h_1440,q_100,w_1... NaN 3500 2242802 Myntra https://www.myntra.com/helmets/royal-enfield/r...
10 BuckleUp Men Black Leather Belt accessories BuckleUp Men Black Leather Belt, BuckleUp, Bel... Leather https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.5 cm 517 1734718 Myntra https://www.myntra.com/belts/buckleup/buckleup...
In [4]:
data.duplicated().sum()
Out [4]:
0

Delete Columns

In [5]:
# Drop the columns this notebook does not keep.
# A single drop() with errors='ignore' (instead of seven `del` statements)
# makes the cell safe to re-run: columns that are already gone are skipped
# rather than raising KeyError.
data = data.drop(
    columns=['FABRIC', 'IMAGE', 'SIZE', 'WEBSITE', 'PRODUCT URL', 'PRICE', 'PRODUCT ID'],
    errors='ignore',
)

Rename Columns

In [6]:
# Shorten the 'DESCRIPTION & COLOR' header; reassign instead of mutating in place.
data = data.rename(columns={'DESCRIPTION & COLOR': 'DESCRIPTION'})
Karena jumlah data besar, diambil sampel acak sebanyak 10.000 baris data.
In [7]:
# Work on a fixed-seed random sample of 10,000 rows, held in its own frame.
data1 = data.sample(n=10000, random_state=1).copy()
In [8]:
# Map raw category slugs to human-readable labels.
# Categories that are not listed here keep their original value.
CATEGORY_LABELS = {
    'accessories': 'Accesories',  # spelling kept: later cells look up this exact label
    'casual-shirts': 'Casual Shirts',
    'Men-Casual-Trousers': 'Men Casual Trousers',
    'formal-shirts': 'Formal Shirts',
    'Men-Formal-Trousers': 'Men Formal Trousers',
    'men-jackets-coats': 'Men Jackets Coats',
    'men-swimwear': 'Men Swimwear',
    'men-suits': 'Men Suits',
}

# Only the CATEGORY column is touched; reassign instead of mutating in place.
data1 = data1.replace({'CATEGORY': CATEGORY_LABELS})
In [9]:
# Renumber the sampled rows 0..n-1 without hard-coding the sample size.
data1 = data1.reset_index(drop=True)

# Total number of space-separated tokens across all product names.
# The vectorised .str accessor skips missing names instead of raising on them.
data1['NAME'].str.split(' ').str.len().sum()
Out [9]:
88348

Punctuation Removal

In [10]:
# Strip punctuation: remove every character that is neither a word character
# nor whitespace. regex=True is stated explicitly because newer pandas
# versions treat the pattern as a literal string by default, and the raw
# string avoids the invalid "\w" escape sequence.
data1['NAME'] = data1['NAME'].str.replace(r'[^\w\s]', '', regex=True)
# Result of the punctuation removal
print(data1["NAME"].head(50))
Out [10]:
0              Fort Collins Men Red Solid Padded Jacket
1     MANGO MAN Men Navy Blue Tailored Slim Fit Soli...
2     Arrow Men Navy Blue Tapered Fit Checked Formal...
3                    Hanes Charcoal Grey Thermal TShirt
4     Hancock Men Blue Regular Fit Striped Formal Shirt
5            Tantra Men Black Printed Round Neck Tshirt
6     Aeropostale Men Blue Regular Fit MidRise Mildl...
7     ether Men Navy Blue Slim Fit Anti Microbial Co...
8     Roadster Men White Regular Fit MidRise Clean L...
9            Dollar Bigboss Pack of 3 Trunks MDTR03PO34
10     Moda Rapido Men Black Printed Polo Collar Tshirt
11    Louis Philippe Men Grey Regular Fit Self Desig...
12                 Light Blue Mid Rise Skinny Fit Jeans
13    HIGHLANDER Men Olive Green Slim Fit Camouflage...
14    US Polo Assn Denim Co Men White  Blue Slim Fit...
15      Levis Men Navy Blue Slim Fit Solid Casual Shirt
16    Louis Philippe Sport Men Charcoal Grey Solid T...
17    Killer Men Blue Regular Fit MidRise Clean Look...
18    Peter England Casuals Men Grey Slim Fit Solid ...
19     Arrow Men Grey Tapered Fit Solid Formal Trousers
20    V Dot Men Grey Slim Fit Self Design Formal Tro...
21            GESPO Men White Printed Round Neck Tshirt
22    SMAG Men Mustard Solid Lightweight Tailored Ja...
23    Jack  Jones Men Black Slim Fit Solid Regular T...
24    Van Heusen Men Blue Regular Fit Solid Formal S...
25                   Maniac Men Grey Solid VNeck Tshirt
26    HERENOW Men Blue Slim Fit MidRise Clean Look S...
27    Blackberrys Men Navy Blue Printed Casual Trousers
28    Moda Rapido Men White Printed Round Neck Longl...
29        Fort Collins Men Tan Brown Solid Biker Jacket
30       Fort Collins Men Rust Brown Solid Biker Jacket
31    ESPRIT Men Navy  OffWhite Striped Round Neck T...
32    US Polo Assn Men Olive Green Regular Fit Solid...
33            Jockey Men Navy Blue Striped VNeck Tshirt
34    Indian Terrain Men Rust Red Solid Polo Collar ...
35            Pacific Gold Men Black Accessory Gift Set
36                    WROGN Men Navy Solid Biker Jacket
37    LOCOMOTIVE Men Blue Slim Fit MidRise Clean Loo...
38                           Blue Washed Slim Fit Jeans
39        LOCOMOTIVE Men Rust Printed Round Neck Tshirt
40    WROGN Men Olive Green Colourblocked Round Neck...
41              Killer Men Red Solid Polo Collar Tshirt
42    Van Heusen Men Blue Contemporary Regular Fit C...
43      Van Heusen Men Blue Slim Fit Solid Casual Shirt
44      Cottonworld Men Black Printed Round Neck Tshirt
45    Reebok Men Blue Athletic Graphic Printed Round...
46    Moda Rapido Men Black  Olive Green Colourblock...
47                    2GO Men Black Printed Polo Tshirt
48    Roadster Men Navy Blue Colourblocked Round Nec...
49    Knotyy Men Grey Colourblocked SelfDesign Beani...
Name: NAME, dtype: object

Case Folding (Convert string to lower)

In [11]:
# Convert every product name to lower case
data1['NAME'] = data1['NAME'].str.lower()
print(data1['NAME'].head(5))
Out [11]:
0             fort collins men red solid padded jacket
1    mango man men navy blue tailored slim fit soli...
2    arrow men navy blue tapered fit checked formal...
3                   hanes charcoal grey thermal tshirt
4    hancock men blue regular fit striped formal shirt
Name: NAME, dtype: object

Remove Stopwords and Stemming

In [12]:
import re
from nltk.corpus import stopwords
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Stop-word set and stemmer are built once and shared by every preprocess() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stopword_set, stemmer)``, creating both on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stopwords"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESS_RESOURCES["stopwords"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess(raw_text):
    """Clean one piece of text: letters only, lower case, no stop words, stemmed.

    Parameters
    ----------
    raw_text : str
        Text to clean. Any non-string value (e.g. ``None`` or a NaN from a
        missing DataFrame cell) is treated as empty text.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces; ``""`` when
        nothing is left.
    """
    # Missing values reach this function as float NaN / None; re.sub would
    # raise TypeError on them.
    if not isinstance(raw_text, str):
        return ""

    # keep only letters (digits and punctuation become separators)
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split on whitespace
    words = letters_only_text.lower().split()
    if not words:
        return ""

    # Loading the stop-word list and constructing the stemmer per call was
    # loop-invariant work repeated for every row; reuse the shared instances.
    stopword_set, stemmer = _get_preprocess_resources()

    # drop stop words, stem what remains, and join back into one string
    return " ".join(stemmer.stem(word) for word in words if word not in stopword_set)
In [13]:
# Clean every product name; preprocess is passed directly, no lambda wrapper needed.
data1['NAME'] = data1['NAME'].apply(preprocess)
In [14]:
data1.head()
NAME CATEGORY DESCRIPTION
0 fort collin men red solid pad jacket Men Jackets Coats Fort Collins Men Red Solid Padded Jacket, For...
1 mango man men navi blue tailor slim fit solid ... Men Formal Trousers MANGO MAN Men Navy Blue Tailored Slim Fit Soli...
2 arrow men navi blue taper fit check formal tro... Men Formal Trousers Arrow Men Navy Blue Tapered Fit Checked Formal...
3 hane charcoal grey thermal tshirt Innerwear & Sleapwear Hanes Charcoal Grey Thermal T Shirt, Hanes, T...
4 hancock men blue regular fit stripe formal shirt Formal Shirts Hancock Men Blue Regular Fit Striped Formal Sh...
In [15]:
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline
Gabungan data pada CATEGORY dan NAME menjadi vocabulary
Membangun graph menggunakan vocabulary
In [16]:
# Undirected graph with one edge per row, linking the row's CATEGORY node to
# its NAME node; edge_attr=True attaches the remaining columns to the edge.
G = nx.from_pandas_edgelist(
    data1,
    source="CATEGORY",
    target="NAME",
    edge_attr=True,
    create_using=nx.Graph(),
)
Memeriksa jumlah node dalam graph
In [17]:
len(G)
Out [17]:
8125
Kami mendefinisikan fungsi yang akan mengambil node dan panjang path yang dilalui sebagai input. Fungsi akan berjalan melalui node yang terhubung dari input node yang ditentukan random walk. Lalu fungsi akan mengembalikan urutan node yang dilalui.
In [18]:
def get_randomwalk(node, path_length, graph=None):
    """Return a self-avoiding random walk that starts at ``node``.

    At every step one not-yet-visited neighbour of the current node is picked
    uniformly at random with ``random.choice``. The walk stops early when the
    current node has no unvisited neighbour.

    Parameters
    ----------
    node : hashable
        Start node; always the first element of the result.
    path_length : int
        Maximum number of nodes in the walk. Values below 2 yield ``[node]``.
    graph : object with a ``neighbors(node)`` method, optional
        Graph to walk on. Defaults to the notebook-level graph ``G``.

    Returns
    -------
    list
        The visited nodes in order, without repetitions.
    """
    if graph is None:
        graph = G

    walk = [node]
    visited = {node}  # O(1) membership test, updated incrementally
    current = node

    for _ in range(path_length - 1):
        # Keep the graph's own neighbour order. Building the candidates from a
        # set difference made their order depend on string hashing, so walks
        # were not repeatable even with a seeded RNG.
        candidates = [nb for nb in graph.neighbors(current) if nb not in visited]
        if not candidates:
            break

        current = random.choice(candidates)
        walk.append(current)
        visited.add(current)

    return walk
Contoh fungsi untuk: Men Formal Trousers
In [19]:
get_randomwalk('Men Formal Trousers', 10)
Out [19]:
['Men Formal Trousers', 'invictu men black slim fit solid formal trouser']
Kami menentukan panjang path untuk dilintasi dengan nilai 10. Kami akan menangkap random walk untuk semua node dalam dataset kami.
In [20]:
# Number of walks started from every node and the maximum length of each walk.
WALKS_PER_NODE = 5
WALK_LENGTH = 10

# Seed Python's RNG so the sampling starts from a fixed state
# (same seed value as the Word2Vec model trained later in this notebook).
random.seed(14)

# get list of all nodes from the graph
all_nodes = list(G.nodes())

random_walks = [
    get_randomwalk(start_node, WALK_LENGTH)
    for start_node in tqdm(all_nodes)
    for _ in range(WALKS_PER_NODE)
]

# count of sequences
len(random_walks)
Out [20]:
100%|████████████████████████████████████████████████████████████████████████████| 8125/8125 [00:06<00:00, 1289.47it/s]
Dengan 5 random walk untuk setiap node (8.125 node) dan panjang path maksimum 10, didapatkan 40.625 urutan random walk. Urutan ini dapat digunakan sebagai input ke model skip-gram untuk mengekstraksi bobot yang dipelajari oleh model (node embedding).
In [21]:
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')
Lalu kami melatih model skip-gram dengan random walk.
In [22]:
# train skip-gram (word2vec) model
# sg = 1 selects skip-gram; hs = 0 together with negative = 10 selects
# negative sampling. No corpus is passed to the constructor, so the vocabulary
# is built and the model trained explicitly in the two calls below.
# NOTE(review): the embedding size and min_count are left at the library
# defaults - confirm these are the intended values.
model = Word2Vec(window = 4, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

# Each random walk is treated as one "sentence" whose "words" are node names.
model.build_vocab(random_walks, progress_per=2)

# Train for 20 epochs over the walks counted by build_vocab (model.corpus_count).
model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)
Out [22]:
(1778978, 2439200)
Setiap node dalam graph diwakili oleh vektor dengan panjang tetap (100). Sebagai contoh, kita cari node yang paling mirip dengan: "Formal Shirts".
In [23]:
# Query through model.wv: calling similar_by_word on the Word2Vec object itself
# is deprecated in gensim 3.x and no longer available in gensim 4.
model.wv.similar_by_word('Formal Shirts')
Out [23]:
[('peter england men grey solid slim fit formal shirt', 0.8732825517654419),
 ('peter england men orang slim fit solid formal shirt', 0.8653634786605835),
 ('arrow new york men blue white slim fit check formal shirt',
  0.864309549331665),
 ('van heusen men creamcolour regular fit solid formal shirt',
  0.8582490682601929),
 ('van heusen men brown purpl slim fit selfdesign formal shirt',
  0.8579122424125671),
 ('invictu men blue slim fit print formal shirt', 0.8563601970672607),
 ('red tape men black regular fit solid formal shirt', 0.8562281131744385),
 ('rg design men blue slim fit stripe linen formal shirt', 0.8559995293617249),
 ('van heusen men lavend slim fit check formal shirt', 0.8549967408180237),
 ('jainish men orang classic slim fit solid formal shirt', 0.8544467091560364)]
Contoh lain, kita cari node yang paling mirip dengan: "Accesories"
In [24]:
# Query through model.wv: calling similar_by_word on the Word2Vec object itself
# is deprecated in gensim 3.x and no longer available in gensim 4.
model.wv.similar_by_word('Accesories')
Out [24]:
[('classic cl icon tape black cap', 0.8467520475387573),
 ('tossido grey check pattern tie', 0.8429974317550659),
 ('tommi hilfig men brown solid belt', 0.8407713174819946),
 ('tommi hilfig men navi blue brown revers solid leather belt',
  0.8403265476226807),
 ('lino perro black solid broad tie', 0.8362069129943848),
 ('loui philipp men navi blue brown solid revers leather belt',
  0.8343643546104431),
 ('hrx hrithik roshan unisex charcoal grey print beani', 0.8334780931472778),
 ('knotyy black solid unisex beani', 0.8293745517730713),
 ('invictu blue coffe brown check tie', 0.8290094137191772),
 ('scharf men brown solid leather belt', 0.8253196477890015)]
In [25]:
# Category nodes whose embeddings are plotted.
terms = [
    'Formal Shirts',
    'Accesories',
    'Casual Shirts',
    'Men Casual Trousers',
    'Men Formal Trousers',
    'Men Jackets Coats',
    'Men Swimwear',
    'Men Suits',
]
In [26]:
def plot_nodes(word_list):
    """Project the embeddings of the given nodes to 2-D with PCA and plot them.

    Parameters
    ----------
    word_list : list of str
        Node names to plot. Each one is looked up in the notebook-level
        ``model``, so it must be present in that model's vocabulary.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Look the vectors up through model.wv: indexing the Word2Vec object
    # directly is deprecated in gensim 3.x and removed in gensim 4.
    X = model.wv[word_list]

    # reduce dimensions to 2
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)

    plt.figure(figsize=(12,9))
    # create a scatter plot of the projection
    plt.scatter(result[:, 0], result[:, 1])
    # label every point with its node name
    for word, (x_coord, y_coord) in zip(word_list, result):
        plt.annotate(word, xy=(x_coord, y_coord))

    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.title("PCA projection of node embeddings")
    plt.show()
In [27]:
plot_nodes(terms)