In [1]:
import numpy as np
import pandas as pd

# Load the product catalogue; the first CSV column is used as the row index.
data = pd.read_csv("men-products.csv", sep=',', index_col=0)
Ignore the tuple
In [2]:
# Per-column count and percentage of missing values.
# The percentage is computed against the actual number of rows instead of a
# hard-coded 20000, so the table stays correct if the input file changes size.
null_counts = data.isnull().sum()
missing_data = pd.DataFrame({
    'total_missing': null_counts,
    'percent_missing': null_counts / len(data) * 100,
})
missing_data
total_missing | percent_missing | |
---|---|---|
NAME | 1 | 0.005 |
CATEGORY | 0 | 0.000 |
DESCRIPTION & COLOR | 0 | 0.000 |
FABRIC | 4833 | 24.165 |
IMAGE | 0 | 0.000 |
SIZE | 3838 | 19.190 |
PRICE | 0 | 0.000 |
PRODUCT ID | 0 | 0.000 |
WEBSITE | 0 | 0.000 |
PRODUCT URL | 0 | 0.000 |
In [3]:
# Preview the first 10 rows of the loaded table.
data.head(10)
NAME | CATEGORY | DESCRIPTION & COLOR | FABRIC | IMAGE | SIZE | PRICE | PRODUCT ID | WEBSITE | PRODUCT URL | |
---|---|---|---|---|---|---|---|---|---|---|
SERIAL NO | ||||||||||
1 | U.S. Polo Assn. Men Brown Genuine Leather Two ... | accessories | U.S. Polo Assn. Men Brown Genuine Leather Two ... | Genuine leather | https://assets.myntassets.com/h_1440,q_100,w_1... | Height: 11.5 cm | 809 | 1943420 | Myntra | https://www.myntra.com/wallets/us-polo-assn/us... |
2 | Baggit Men Black Solid Two Fold Wallet | accessories | Baggit Men Black Solid Two Fold Wallet, Baggi... | PU | https://assets.myntassets.com/h_1440,q_100,w_1... | Height: | 720 | 4608404 | Myntra | https://www.myntra.com/wallets/baggit/baggit-m... |
3 | HRX by Hrithik Roshan Men Grey Solid Baseball Cap | accessories | HRX By Hrithik Roshan Men Grey Solid Baseball ... | NaN | https://assets.myntassets.com/h_1440,q_100,w_1... | NaN | 279 | 2178513 | Myntra | https://www.myntra.com/caps/hrx-by-hrithik-ros... |
4 | Puma Unisex Grey Style Military Solid Baseball... | accessories | Puma Unisex Grey Style Military Solid Baseball... | NaN | https://assets.myntassets.com/h_1440,q_100,w_1... | NaN | 499 | 6699035 | Myntra | https://www.myntra.com/caps/puma/puma-unisex-g... |
5 | FabSeasons Beige Solid Scarf | accessories | FabSeasons Beige Solid Scarf, FabSeasons, Scar... | Acrylic | https://assets.myntassets.com/h_1440,q_100,w_1... | Length:0.9 m | 449 | 2439658 | Myntra | https://www.myntra.com/scarves/fabseasons/fabs... |
6 | Ed Hardy Men Black Embellished Belt | accessories | Ed Hardy Men Black Embellished Belt, Ed Hardy... | Leather | https://assets.myntassets.com/h_1440,q_100,w_1... | Width: 3.7 cm | 1199 | 2238752 | Myntra | https://www.myntra.com/belts/ed-hardy/ed-hardy... |
7 | Roadster Men Tan Brown Leather Belt | accessories | Roadster Men Tan Brown Leather Belt, Roadster,... | Leather | https://assets.myntassets.com/h_1440,q_100,w_1... | Width: 4 cm | 419 | 2975974 | Myntra | https://www.myntra.com/belts/roadster/roadster... |
8 | Peora Silver-Toned Rhodium-Plated Stone-Studde... | accessories | Peora Silver Toned Rhodium Plated Stone Studde... | NaN | https://assets.myntassets.com/h_1440,q_100,w_1... | NaN | 551 | 3006095 | Myntra | https://www.myntra.com/ring/peora/peora-silver... |
9 | Royal Enfield Unisex White Urban Trooper Helme... | accessories | Royal Enfield Unisex White Urban Trooper Helme... | NaN | https://assets.myntassets.com/h_1440,q_100,w_1... | NaN | 3500 | 2242802 | Myntra | https://www.myntra.com/helmets/royal-enfield/r... |
10 | BuckleUp Men Black Leather Belt | accessories | BuckleUp Men Black Leather Belt, BuckleUp, Bel... | Leather | https://assets.myntassets.com/h_1440,q_100,w_1... | Width: 3.5 cm | 517 | 1734718 | Myntra | https://www.myntra.com/belts/buckleup/buckleup... |
In [4]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()
Out [4]:
0
Delete Columns
In [5]:
# Remove, in place, the columns that are not used in the rest of the notebook.
for column in ('FABRIC', 'IMAGE', 'SIZE', 'WEBSITE', 'PRODUCT URL', 'PRICE', 'PRODUCT ID'):
    del data[column]
Rename Columns
In [6]:
# Shorten the 'DESCRIPTION & COLOR' column name to 'DESCRIPTION' (modifies `data` in place).
data.rename(columns = {'DESCRIPTION & COLOR':'DESCRIPTION'}, inplace = True)
Karena Data dalam jumlah besar maka diambil sampel 10.000 data
In [7]:
# Draw 10,000 rows at random with a fixed seed (random_state=1) and keep them
# as a separate copy named `data1`.
data1 = data.sample(10000, random_state=1).copy()
In [8]:
# Relabel the raw product-category values with display-friendly names.
# NOTE(review): 'Accesories' is spelled this way on purpose here -- later cells
# look the category up under exactly this label.
category_labels = {
    'accessories': 'Accesories',
    'casual-shirts': 'Casual Shirts',
    'Men-Casual-Trousers': 'Men Casual Trousers',
    'formal-shirts': 'Formal Shirts',
    'Men-Formal-Trousers': 'Men Formal Trousers',
    'men-jackets-coats': 'Men Jackets Coats',
    'men-swimwear': 'Men Swimwear',
    'men-suits': 'Men Suits',
}
data1.replace({'CATEGORY': category_labels}, inplace=True)
In [9]:
# Renumber the sampled rows 0..n-1. The length is taken from the frame itself
# so this cell keeps working if the sample size is changed.
data1.index = range(len(data1))

# Total number of space-separated tokens over all product names.
# The .str accessor skips missing names instead of raising on them
# (the result is then a float rather than an int).
data1['NAME'].str.split(' ').str.len().sum()
Out [9]:
88348
Punctuation Removal
In [10]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: since pandas 2.0 the default is regex=False,
# which would search for the literal text "[^\w\s]" and remove nothing.
# The raw string avoids the invalid "\w" / "\s" escape-sequence warning.
data1['NAME'] = data1['NAME'].str.replace(r'[^\w\s]', '', regex=True)

# Result of the punctuation removal
print(data1["NAME"].head(50))
Out [10]:
0 Fort Collins Men Red Solid Padded Jacket 1 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... 2 Arrow Men Navy Blue Tapered Fit Checked Formal... 3 Hanes Charcoal Grey Thermal TShirt 4 Hancock Men Blue Regular Fit Striped Formal Shirt 5 Tantra Men Black Printed Round Neck Tshirt 6 Aeropostale Men Blue Regular Fit MidRise Mildl... 7 ether Men Navy Blue Slim Fit Anti Microbial Co... 8 Roadster Men White Regular Fit MidRise Clean L... 9 Dollar Bigboss Pack of 3 Trunks MDTR03PO34 10 Moda Rapido Men Black Printed Polo Collar Tshirt 11 Louis Philippe Men Grey Regular Fit Self Desig... 12 Light Blue Mid Rise Skinny Fit Jeans 13 HIGHLANDER Men Olive Green Slim Fit Camouflage... 14 US Polo Assn Denim Co Men White Blue Slim Fit... 15 Levis Men Navy Blue Slim Fit Solid Casual Shirt 16 Louis Philippe Sport Men Charcoal Grey Solid T... 17 Killer Men Blue Regular Fit MidRise Clean Look... 18 Peter England Casuals Men Grey Slim Fit Solid ... 19 Arrow Men Grey Tapered Fit Solid Formal Trousers 20 V Dot Men Grey Slim Fit Self Design Formal Tro... 21 GESPO Men White Printed Round Neck Tshirt 22 SMAG Men Mustard Solid Lightweight Tailored Ja... 23 Jack Jones Men Black Slim Fit Solid Regular T... 24 Van Heusen Men Blue Regular Fit Solid Formal S... 25 Maniac Men Grey Solid VNeck Tshirt 26 HERENOW Men Blue Slim Fit MidRise Clean Look S... 27 Blackberrys Men Navy Blue Printed Casual Trousers 28 Moda Rapido Men White Printed Round Neck Longl... 29 Fort Collins Men Tan Brown Solid Biker Jacket 30 Fort Collins Men Rust Brown Solid Biker Jacket 31 ESPRIT Men Navy OffWhite Striped Round Neck T... 32 US Polo Assn Men Olive Green Regular Fit Solid... 33 Jockey Men Navy Blue Striped VNeck Tshirt 34 Indian Terrain Men Rust Red Solid Polo Collar ... 35 Pacific Gold Men Black Accessory Gift Set 36 WROGN Men Navy Solid Biker Jacket 37 LOCOMOTIVE Men Blue Slim Fit MidRise Clean Loo... 
38 Blue Washed Slim Fit Jeans 39 LOCOMOTIVE Men Rust Printed Round Neck Tshirt 40 WROGN Men Olive Green Colourblocked Round Neck... 41 Killer Men Red Solid Polo Collar Tshirt 42 Van Heusen Men Blue Contemporary Regular Fit C... 43 Van Heusen Men Blue Slim Fit Solid Casual Shirt 44 Cottonworld Men Black Printed Round Neck Tshirt 45 Reebok Men Blue Athletic Graphic Printed Round... 46 Moda Rapido Men Black Olive Green Colourblock... 47 2GO Men Black Printed Polo Tshirt 48 Roadster Men Navy Blue Colourblocked Round Nec... 49 Knotyy Men Grey Colourblocked SelfDesign Beani... Name: NAME, dtype: object
Case Folding (Convert string to lower)
In [11]:
# Convert every product name to lower case, then show the first five.
data1['NAME'] = data1['NAME'].str.lower()
print(data1['NAME'].iloc[:5])
Out [11]:
0 fort collins men red solid padded jacket 1 mango man men navy blue tailored slim fit soli... 2 arrow men navy blue tapered fit checked formal... 3 hanes charcoal grey thermal tshirt 4 hancock men blue regular fit striped formal shirt Name: NAME, dtype: object
Remove Stopwords and Stemming
In [12]:
import re
from nltk.corpus import stopwords
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Built once when the cell runs instead of on every preprocess() call:
# loading the stopword list and constructing a stemmer per row is wasted work
# when the function is applied to thousands of names.
_STOPWORD_SET = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()
_NON_LETTERS = re.compile("[^a-zA-Z]")


def preprocess(raw_text):
    """Clean one text string: keep letters, lower-case, drop stopwords, stem.

    Parameters
    ----------
    raw_text : str
        Text to clean. Any non-string value (for example a missing/NaN cell)
        is treated as empty text.

    Returns
    -------
    str
        The stemmed, non-stopword words joined by single spaces; an empty
        string when nothing remains or the input was not a string.
    """
    # A missing cell arrives as a float NaN, which re.sub cannot process.
    if not isinstance(raw_text, str):
        return ""

    # Replace everything except ASCII letters with a space, then tokenize.
    words = _NON_LETTERS.sub(" ", raw_text).lower().split()

    # Drop English stopwords and reduce the remaining words to their stems.
    stemmed_words = [_STEMMER.stem(word) for word in words if word not in _STOPWORD_SET]
    return " ".join(stemmed_words)
In [13]:
# Run the stopword-removal / stemming routine on every product name.
data1['NAME'] = data1['NAME'].apply(preprocess)
In [14]:
# Show the first rows of the sample after the NAME column has been preprocessed.
data1.head()
NAME | CATEGORY | DESCRIPTION | |
---|---|---|---|
0 | fort collin men red solid pad jacket | Men Jackets Coats | Fort Collins Men Red Solid Padded Jacket, For... |
1 | mango man men navi blue tailor slim fit solid ... | Men Formal Trousers | MANGO MAN Men Navy Blue Tailored Slim Fit Soli... |
2 | arrow men navi blue taper fit check formal tro... | Men Formal Trousers | Arrow Men Navy Blue Tapered Fit Checked Formal... |
3 | hane charcoal grey thermal tshirt | Innerwear & Sleapwear | Hanes Charcoal Grey Thermal T Shirt, Hanes, T... |
4 | hancock men blue regular fit stripe formal shirt | Formal Shirts | Hancock Men Blue Regular Fit Striped Formal Sh... |
In [15]:
import networkx as nx import pandas as pd import numpy as np import random from tqdm import tqdm from sklearn.decomposition import PCA import matplotlib.pyplot as plt %matplotlib inline
Gabungan data pada CATEGORY dan NAME menjadi vocabulary
Membangun graph menggunakan vocabulary
In [16]:
# Build an undirected graph with one edge per row, linking the row's CATEGORY
# value to its NAME value; the remaining columns are stored as edge attributes.
G = nx.from_pandas_edgelist(
    data1,
    source="CATEGORY",
    target="NAME",
    edge_attr=True,
    create_using=nx.Graph(),
)
Memeriksa jumlah node dalam graph
In [17]:
# Number of nodes in the graph.
G.number_of_nodes()
Out [17]:
8125
Kami mendefinisikan fungsi yang menerima sebuah node dan panjang path sebagai input. Mulai dari node tersebut, fungsi berpindah secara acak (random walk) ke node tetangga yang belum dikunjungi, lalu mengembalikan urutan node yang dilalui. Jika tidak ada lagi tetangga yang belum dikunjungi, random walk berhenti lebih awal.
In [18]:
def get_randomwalk(node, path_length, graph=None):
    """Return a random walk that starts at `node` and never revisits a node.

    Parameters
    ----------
    node : hashable
        Start node of the walk.
    path_length : int
        Maximum number of nodes in the returned walk (including the start).
    graph : object with a ``neighbors(node)`` method, optional
        Graph to walk on. Defaults to the notebook-level graph ``G``.

    Returns
    -------
    list
        The visited nodes in order. The walk stops early, and the list is
        shorter than `path_length`, as soon as the current node has no
        unvisited neighbour.
    """
    if graph is None:
        graph = G

    random_walk = [node]
    visited = {node}  # kept incrementally instead of rebuilding a set each step

    for _ in range(path_length - 1):
        # Keep the graph's own neighbour order. Building the candidates via a
        # set difference makes their order depend on string hashing, so the
        # same random seed could pick different nodes in different sessions.
        candidates = [nbr for nbr in graph.neighbors(node) if nbr not in visited]
        if not candidates:
            break

        node = random.choice(candidates)
        random_walk.append(node)
        visited.add(node)

    return random_walk
Contoh fungsi untuk: Men Formal Trousers
In [19]:
# Example: one random walk of at most 10 nodes starting from a category node.
get_randomwalk('Men Formal Trousers', 10)
Out [19]:
['Men Formal Trousers', 'invictu men black slim fit solid formal trouser']
Kami menentukan panjang path untuk dilintasi dengan nilai 10. Kami akan menangkap random walk untuk semua node dalam dataset kami.
In [20]:
# Walk-generation settings, named so they are easy to find and tune.
WALKS_PER_NODE = 5   # number of walks started from every node
WALK_LENGTH = 10     # maximum number of nodes per walk
WALK_SEED = 14       # seed for Python's `random`, which picks each next node

# Seed the generator so re-running this cell draws the same choices.
# NOTE(review): if the walk function orders its candidates through a set of
# strings, that order also varies with PYTHONHASHSEED -- confirm if exact
# reproducibility across sessions is required.
random.seed(WALK_SEED)

# get list of all nodes from the graph
all_nodes = list(G.nodes())

random_walks = []
for n in tqdm(all_nodes):
    for _ in range(WALKS_PER_NODE):
        random_walks.append(get_randomwalk(n, WALK_LENGTH))

# count of sequences
len(random_walks)
Out [20]:
100%|████████████████████████████████████████████████████████████████████████████| 8125/8125 [00:06<00:00, 1289.47it/s]
Dengan 5 random walk untuk setiap node dari 8.125 node (masing-masing dengan panjang path maksimum 10), didapatkan 40.625 urutan random walk. Urutan ini dapat digunakan sebagai input ke model skip-gram untuk mengekstraksi bobot yang dipelajari oleh model (node embedding).
In [21]:
from gensim.models import Word2Vec import warnings warnings.filterwarnings('ignore')
Lalu kami melatih model skip-gram dengan random walk.
In [22]:
# Skip-gram (sg=1) Word2Vec trained with negative sampling (hs=0, negative=10)
# on the random-walk sequences.
skipgram_params = dict(
    window=4,
    sg=1,
    hs=0,
    negative=10,  # for negative sampling
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)
model = Word2Vec(**skipgram_params)

# Register every node that appears in the walks as a vocabulary entry.
model.build_vocab(random_walks, progress_per=2)

# Train for 20 passes over the walks; the call's return value is the cell output.
model.train(random_walks, total_examples=model.corpus_count, epochs=20, report_delay=1)
Out [22]:
(1778978, 2439200)
Setiap node dalam graph diwakili oleh vektor dengan panjang tetap (100). Sebagai contoh, kita cari node yang paling mirip dengan "Formal Shirts".
In [23]:
# Nodes whose embeddings are closest to the 'Formal Shirts' category node.
# The lookup goes through model.wv (the trained KeyedVectors): calling
# similar_by_word directly on the Word2Vec model is deprecated in gensim 3.x
# and no longer available in gensim 4.
model.wv.similar_by_word('Formal Shirts')
Out [23]:
[('peter england men grey solid slim fit formal shirt', 0.8732825517654419), ('peter england men orang slim fit solid formal shirt', 0.8653634786605835), ('arrow new york men blue white slim fit check formal shirt', 0.864309549331665), ('van heusen men creamcolour regular fit solid formal shirt', 0.8582490682601929), ('van heusen men brown purpl slim fit selfdesign formal shirt', 0.8579122424125671), ('invictu men blue slim fit print formal shirt', 0.8563601970672607), ('red tape men black regular fit solid formal shirt', 0.8562281131744385), ('rg design men blue slim fit stripe linen formal shirt', 0.8559995293617249), ('van heusen men lavend slim fit check formal shirt', 0.8549967408180237), ('jainish men orang classic slim fit solid formal shirt', 0.8544467091560364)]
Contoh lain: kita cari node yang paling mirip dengan "Accesories".
In [24]:
# Nodes whose embeddings are closest to the 'Accesories' category node.
# The lookup goes through model.wv (the trained KeyedVectors): calling
# similar_by_word directly on the Word2Vec model is deprecated in gensim 3.x
# and no longer available in gensim 4.
model.wv.similar_by_word('Accesories')
Out [24]:
[('classic cl icon tape black cap', 0.8467520475387573), ('tossido grey check pattern tie', 0.8429974317550659), ('tommi hilfig men brown solid belt', 0.8407713174819946), ('tommi hilfig men navi blue brown revers solid leather belt', 0.8403265476226807), ('lino perro black solid broad tie', 0.8362069129943848), ('loui philipp men navi blue brown solid revers leather belt', 0.8343643546104431), ('hrx hrithik roshan unisex charcoal grey print beani', 0.8334780931472778), ('knotyy black solid unisex beani', 0.8293745517730713), ('invictu blue coffe brown check tie', 0.8290094137191772), ('scharf men brown solid leather belt', 0.8253196477890015)]
In [25]:
# Category labels to plot. The spellings (including 'Accesories') must match
# the labels assigned to the CATEGORY column earlier in the notebook.
terms = ['Formal Shirts', 'Accesories', 'Casual Shirts','Men Casual Trousers', 'Men Formal Trousers', 'Men Jackets Coats','Men Swimwear', 'Men Suits']
In [26]:
def plot_nodes(word_list):
    """Scatter-plot the embeddings of the given nodes after reducing them to 2-D.

    Parameters
    ----------
    word_list : list of str
        Node names to plot. Each must be in the trained model's vocabulary.

    The function shows the figure and returns nothing.
    """
    # Fetch the vectors through model.wv: indexing the Word2Vec model itself
    # (model[...]) is deprecated in gensim 3.x and removed in gensim 4.
    X = model.wv[word_list]

    # reduce dimensions to 2
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)

    plt.figure(figsize=(12, 9))
    # create a scatter plot of the projection
    plt.scatter(result[:, 0], result[:, 1])
    for i, word in enumerate(word_list):
        plt.annotate(word, xy=(result[i, 0], result[i, 1]))

    # Label the axes so the figure can be read on its own.
    plt.xlabel("PCA component 1")
    plt.ylabel("PCA component 2")
    plt.show()
In [27]:
# Plot the 2-D projection of the category-node embeddings listed in `terms`.
plot_nodes(terms)