import pandas as pd
import matplotlib. pyplot as plt
import sklearn. metrics as metrics
import numpy as np
from sklearn. neighbors import NearestNeighbors
from scipy. spatial. distance import correlation
from sklearn. metrics. pairwise import pairwise_distances
import ipywidgets as widgets
from IPython. display import display, clear_output
from contextlib import contextmanager
import warnings
warnings. filterwarnings( 'ignore' )
import numpy as np
import os, sys
import re
import seaborn as sns
def _read_bx_csv(path, column_names):
    """Read one BX-* CSV file and replace its header with ``column_names``."""
    # sep=None asks pandas to detect the delimiter; the bytes are decoded
    # as latin-1.
    frame = pd.read_csv(path, sep=None, encoding="latin-1")
    frame.columns = column_names
    return frame


books = _read_bx_csv(
    'F:\\data\\bleeding_data\\BX-Books.csv',
    ['ISBN', 'bookTitle', 'bookAuthor',
     'yearOfPublication', 'publisher',
     'imageUrlS', 'imageUrlM', 'imageUrlL'],
)
users = _read_bx_csv(
    'F:\\data\\bleeding_data\\BX-Users.csv',
    ['userID', 'Location', 'Age'],
)
ratings = _read_bx_csv(
    'F:\\data\\bleeding_data\\BX-Book-Ratings.csv',
    ['userID', 'ISBN', 'bookRating'],
)

# One (rows, columns) tuple per line: books, users, ratings.
print(books.shape, users.shape, ratings.shape, sep='\n')
(271360, 8)
(278858, 3)
(1149780, 3)
books. head( )
ISBN bookTitle bookAuthor yearOfPublication publisher imageUrlS imageUrlM imageUrlL 0 0195153448 Classical Mythology Mark P. O. Morford 2002 Oxford University Press http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0... 1 0002005018 Clara Callan Richard Bruce Wright 2001 HarperFlamingo Canada http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0... 2 0060973129 Decision in Normandy Carlo D'Este 1991 HarperPerennial http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0... 3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata 1999 Farrar Straus Giroux http://images.amazon.com/images/P/0374157065.0... http://images.amazon.com/images/P/0374157065.0... http://images.amazon.com/images/P/0374157065.0... 4 0393045218 The Mummies of Urumchi E. J. W. Barber 1999 W. W. Norton & Company http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0...
# Remove the three image-URL columns from `books` in place, then preview it.
books.drop(columns=['imageUrlS', 'imageUrlM', 'imageUrlL'], inplace=True)
books.head()
ISBN bookTitle bookAuthor yearOfPublication publisher 0 0195153448 Classical Mythology Mark P. O. Morford 2002 Oxford University Press 1 0002005018 Clara Callan Richard Bruce Wright 2001 HarperFlamingo Canada 2 0060973129 Decision in Normandy Carlo D'Este 1991 HarperPerennial 3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata 1999 Farrar Straus Giroux 4 0393045218 The Mummies of Urumchi E. J. W. Barber 1999 W. W. Norton & Company
books. dtypes
ISBN object
bookTitle object
bookAuthor object
yearOfPublication object
publisher object
dtype: object
books. bookTitle. unique( )
array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
'Lily Dale : The True Story of the Town that Talks to the Dead',
"Republic (World's Classics)",
"A Guided Tour of Rene Descartes' Meditations on First Philosophy with Complete Translations of the Meditations by Ronald Rubin"],
dtype=object)
books. yearOfPublication. unique( )
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
'2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
'1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
'1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
'1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
'1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
'1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
'1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
'1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
'2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
'1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
'1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
'1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
'2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
'2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
dtype=object)
books. loc[ books. yearOfPublication == 'DK Publishing Inc' , : ]
books. yearOfPublication. unique( )
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
'2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
'1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
'1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
'1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
'1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
'1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
'1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
'1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
'2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
'1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
'1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
'1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
'2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
'2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
dtype=object)
print ( books. loc[ books. yearOfPublication == 'DK Publishing Inc' , : ] )
ISBN bookTitle \
209538 078946697X DK Readers: Creating the X-Men, How It All Beg...
221678 0789466953 DK Readers: Creating the X-Men, How Comic Book...
bookAuthor yearOfPublication \
209538 2000 DK Publishing Inc
221678 2000 DK Publishing Inc
publisher
209538 http://images.amazon.com/images/P/078946697X.0...
221678 http://images.amazon.com/images/P/0789466953.0...
books. loc[ books. yearOfPublication == 'DK Publishing Inc' , : ]
ISBN bookTitle bookAuthor yearOfPublication publisher 209538 078946697X DK Readers: Creating the X-Men, How It All Beg... 2000 DK Publishing Inc http://images.amazon.com/images/P/078946697X.0... 221678 0789466953 DK Readers: Creating the X-Men, How Comic Book... 2000 DK Publishing Inc http://images.amazon.com/images/P/0789466953.0...
# Hand-written replacement values for the two rows whose yearOfPublication
# cell holds 'DK Publishing Inc'. Keys are ISBNs; each inner dict maps a
# column name to the value written into that row.
_dk_corrections = {
    '0789466953': {
        'yearOfPublication': 2000,
        'bookAuthor': "James Buckley",
        'publisher': "DK Publishing Inc",
        'bookTitle': "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)",
    },
    '078946697X': {
        'yearOfPublication': 2000,
        'bookAuthor': "Michael Teitelbaum",
        'publisher': "DK Publishing Inc",
        'bookTitle': "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)",
    },
}
for _isbn, _fields in _dk_corrections.items():
    for _column, _value in _fields.items():
        # One scalar per assignment so each value keeps its own Python type.
        books.loc[books.ISBN == _isbn, _column] = _value

# Show both repaired rows.
books.loc[books.ISBN.isin(list(_dk_corrections)), :]
ISBN bookTitle bookAuthor yearOfPublication publisher 209538 078946697X DK Readers: Creating the X-Men, How It All Beg... Michael Teitelbaum 2000 DK Publishing Inc 221678 0789466953 DK Readers: Creating the X-Men, How Comic Book... James Buckley 2000 DK Publishing Inc
# Convert the year column to numbers; entries that cannot be parsed become NaN.
books['yearOfPublication'] = pd.to_numeric(
    books['yearOfPublication'], errors='coerce'
)
sorted(books['yearOfPublication'].unique())
[0.0,
1376.0,
1378.0,
1806.0,
1897.0,
1900.0,
1901.0,
1902.0,
1904.0,
1906.0,
1908.0,
1909.0,
1910.0,
1911.0,
1914.0,
1917.0,
1919.0,
1920.0,
1921.0,
1922.0,
1923.0,
1924.0,
1925.0,
1926.0,
1927.0,
1928.0,
1929.0,
1930.0,
1931.0,
1932.0,
1933.0,
1934.0,
1935.0,
1936.0,
1937.0,
1938.0,
1939.0,
1940.0,
1941.0,
1942.0,
1943.0,
1944.0,
1945.0,
1946.0,
1947.0,
1948.0,
1949.0,
1950.0,
1951.0,
1952.0,
1953.0,
1954.0,
1955.0,
1956.0,
1957.0,
1958.0,
1959.0,
1960.0,
1961.0,
1962.0,
1963.0,
1964.0,
1965.0,
1966.0,
1967.0,
1968.0,
1969.0,
1970.0,
1971.0,
1972.0,
1973.0,
1974.0,
1975.0,
1976.0,
1977.0,
1978.0,
1979.0,
1980.0,
1981.0,
1982.0,
1983.0,
1984.0,
1985.0,
1986.0,
1987.0,
1988.0,
1989.0,
1990.0,
1991.0,
1992.0,
1993.0,
1994.0,
1995.0,
1996.0,
1997.0,
1998.0,
1999.0,
2000.0,
2001.0,
2002.0,
2003.0,
2004.0,
2005.0,
2006.0,
2008.0,
2010.0,
2011.0,
2012.0,
2020.0,
2021.0,
2024.0,
2026.0,
2030.0,
2037.0,
2038.0,
2050.0,
nan]
# Treat years after 2006 and the placeholder year 0 as missing.
# `np.nan` is used because the `np.NAN` alias no longer exists in NumPy 2.x.
books.loc[
    (books.yearOfPublication > 2006) | (books.yearOfPublication == 0),
    'yearOfPublication',
] = np.nan

# Replace missing years with the rounded mean of the remaining years.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `books.yearOfPublication` operates on an intermediate Series and does not
# reliably update `books` when pandas Copy-on-Write is active.
books['yearOfPublication'] = books['yearOfPublication'].fillna(
    round(books['yearOfPublication'].mean())
)

# Number of years still missing after the fill.
books.yearOfPublication.isnull().sum()
0
# Store the year column as 32-bit integers.
books['yearOfPublication'] = books['yearOfPublication'].astype(np.int32)
# Rows whose publisher is missing.
books.loc[books['publisher'].isnull(), :]
ISBN bookTitle bookAuthor yearOfPublication publisher 128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN 129037 1931696993 Finders Keepers Linnea Sinclair 2001 NaN
books. loc[ ( books. bookTitle == 'Tyrant Moon' ) , : ]
ISBN bookTitle bookAuthor yearOfPublication publisher 128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN
books. loc[ ( books. bookTitle == 'Finders Keepers' ) , : ]
ISBN bookTitle bookAuthor yearOfPublication publisher 10799 082177364X Finders Keepers Fern Michaels 2002 Zebra Books 42019 0070465037 Finders Keepers Barbara Nickolae 1989 McGraw-Hill Companies 58264 0688118461 Finders Keepers Emily Rodda 1993 Harpercollins Juvenile Books 66678 1575663236 Finders Keepers Fern Michaels 1998 Kensington Publishing Corporation 129037 1931696993 Finders Keepers Linnea Sinclair 2001 NaN 134309 0156309505 Finders Keepers Will 1989 Voyager Books 173473 0973146907 Finders Keepers Sean M. Costello 2002 Red Tower Publications 195885 0061083909 Finders Keepers Sharon Sala 2003 HarperTorch 211874 0373261160 Finders Keepers Elizabeth Travis 1993 Worldwide Library
books. loc[ ( books. bookAuthor == 'Elaine Corvidae' ) , : ]
ISBN bookTitle bookAuthor yearOfPublication publisher 126762 1931696934 Winter's Orphans Elaine Corvidae 2001 Novelbooks 128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN 129001 0759901880 Wolfkin Elaine Corvidae 2001 Hard Shell Word Factory
books. loc[ ( books. bookAuthor == 'Linnea Sinclair' ) , : ]
ISBN bookTitle bookAuthor yearOfPublication publisher 129037 1931696993 Finders Keepers Linnea Sinclair 2001 NaN
# Give both of these ISBNs the placeholder publisher 'other'.
books.loc[books.ISBN.isin(['193169656X', '1931696993']), 'publisher'] = 'other'

print(users.shape)
users.head()
(278858, 3)
userID Location Age 0 1 nyc, new york, usa NaN 1 2 stockton, california, usa 18.0 2 3 moscow, yukon territory, russia NaN 3 4 porto, v.n.gaia, portugal 17.0 4 5 farnborough, hants, united kingdom NaN
users. dtypes
userID int64
Location object
Age float64
dtype: object
users. userID. values
array([ 1, 2, 3, ..., 278856, 278857, 278858], dtype=int64)
sorted ( users. Age. unique( ) )
[nan,
0.0,
1.0,
2.0,
3.0,
4.0,
5.0,
6.0,
7.0,
8.0,
9.0,
10.0,
11.0,
12.0,
13.0,
14.0,
15.0,
16.0,
17.0,
18.0,
19.0,
20.0,
21.0,
22.0,
23.0,
24.0,
25.0,
26.0,
27.0,
28.0,
29.0,
30.0,
31.0,
32.0,
33.0,
34.0,
35.0,
36.0,
37.0,
38.0,
39.0,
40.0,
41.0,
42.0,
43.0,
44.0,
45.0,
46.0,
47.0,
48.0,
49.0,
50.0,
51.0,
52.0,
53.0,
54.0,
55.0,
56.0,
57.0,
58.0,
59.0,
60.0,
61.0,
62.0,
63.0,
64.0,
65.0,
66.0,
67.0,
68.0,
69.0,
70.0,
71.0,
72.0,
73.0,
74.0,
75.0,
76.0,
77.0,
78.0,
79.0,
80.0,
81.0,
82.0,
83.0,
84.0,
85.0,
86.0,
87.0,
88.0,
89.0,
90.0,
91.0,
92.0,
93.0,
94.0,
95.0,
96.0,
97.0,
98.0,
99.0,
100.0,
101.0,
102.0,
103.0,
104.0,
105.0,
106.0,
107.0,
108.0,
109.0,
110.0,
111.0,
113.0,
114.0,
115.0,
116.0,
118.0,
119.0,
123.0,
124.0,
127.0,
128.0,
132.0,
133.0,
136.0,
137.0,
138.0,
140.0,
141.0,
143.0,
146.0,
147.0,
148.0,
151.0,
152.0,
156.0,
157.0,
159.0,
162.0,
168.0,
172.0,
175.0,
183.0,
186.0,
189.0,
199.0,
200.0,
201.0,
204.0,
207.0,
208.0,
209.0,
210.0,
212.0,
219.0,
220.0,
223.0,
226.0,
228.0,
229.0,
230.0,
231.0,
237.0,
239.0,
244.0]
# Keep ages within 5..90 (inclusive); everything else becomes NaN.
users['Age'] = users['Age'].where(users['Age'].between(5, 90))
# Fill the gaps with the mean of the remaining ages, then store as int32.
users['Age'] = users['Age'].fillna(users['Age'].mean()).astype(np.int32)
sorted(users['Age'].unique())
[5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90]
ratings. shape
(1149780, 3)
# Row counts of the two tables, and the size of a full user x book grid.
n_users, n_books = len(users), len(books)
print(n_users * n_books)
75670906880
ratings. head( 5 )
userID ISBN bookRating 0 276725 034545104X 0 1 276726 0155061224 5 2 276727 0446520802 0 3 276729 052165615X 3 4 276729 0521795028 6
ratings. bookRating. unique( )
array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)
# Keep only ratings whose ISBN appears in `books`.
ratings_new = ratings.loc[ratings['ISBN'].isin(books['ISBN'])]
# Shapes before and after the filter, one per line.
print(ratings.shape, ratings_new.shape, sep='\n')
(1149780, 3)
(1031136, 3)
print ( "number of users: " + str ( n_users) )
print ( "number of books: " + str ( n_books) )
number of users: 278858
number of books: 271360
# Fraction of the n_users x n_books grid that has no row in ratings_new.
sparsity = 1.0 - ratings_new.shape[0] / float(n_users * n_books)
# The message reports the sparsity level as a percentage.
print('图书交叉数据集的稀疏级别是 ' + str(sparsity * 100) + ' %')
图书交叉数据集的稀疏级别是 99.99863734155898 %
ratings. bookRating. unique( )
array([ 0, 5, 3, 6, 8, 7, 10, 9, 4, 1, 2], dtype=int64)
# Split the ratings on whether bookRating is 0 (implicit) or not (explicit).
ratings_explicit = ratings_new.loc[ratings_new['bookRating'].ne(0)]
ratings_implicit = ratings_new.loc[ratings_new['bookRating'].eq(0)]
# Shapes of the full set and the two subsets, one per line.
print(ratings_new.shape, ratings_explicit.shape, ratings_implicit.shape, sep='\n')
(1031136, 3)
(383842, 3)
(647294, 3)
# Bar chart of how often each explicit rating value occurs.
sns.countplot(x='bookRating', data=ratings_explicit)
plt.show()

# Despite the name, `ratings_count` holds the SUM of explicit ratings per ISBN.
ratings_count = ratings_explicit.groupby('ISBN')[['bookRating']].sum()
# The ten ISBNs with the largest rating sum.
top10 = ratings_count.sort_values(by='bookRating', ascending=False).head(10)
print("推荐下列书籍")
# Attach the book details by matching the ISBN index against books.ISBN.
top10.merge(books, left_index=True, right_on='ISBN')
推荐下列书籍
bookRating ISBN bookTitle bookAuthor yearOfPublication publisher 408 5787 0316666343 The Lovely Bones: A Novel Alice Sebold 2002 Little, Brown 748 4108 0385504209 The Da Vinci Code Dan Brown 2003 Doubleday 522 3134 0312195516 The Red Tent (Bestselling Backlist) Anita Diamant 1998 Picador USA 2143 2798 059035342X Harry Potter and the Sorcerer's Stone (Harry P... J. K. Rowling 1999 Arthur A. Levine Books 356 2595 0142001740 The Secret Life of Bees Sue Monk Kidd 2003 Penguin Books 26 2551 0971880107 Wild Animus Rich Shapero 2004 Too Far 1105 2524 0060928336 Divine Secrets of the Ya-Ya Sisterhood: A Novel Rebecca Wells 1997 Perennial 706 2402 0446672211 Where the Heart Is (Oprah's Book Club (Paperba... Billie Letts 1998 Warner Books 231 2219 0452282152 Girl with a Pearl Earring Tracy Chevalier 2001 Plume Books 118 2179 0671027360 Angels & Demons Dan Brown 2001 Pocket Star
# Users that appear in the explicit and in the implicit ratings respectively.
users_exp_ratings = users.loc[users['userID'].isin(ratings_explicit['userID'])]
users_imp_ratings = users.loc[users['userID'].isin(ratings_implicit['userID'])]
# Shapes of all users and of the two subsets, one per line.
print(users.shape, users_exp_ratings.shape, users_imp_ratings.shape, sep='\n')
(278858, 3)
(68091, 3)
(52451, 3)
# Build the user x ISBN rating matrix from a reduced set of explicit ratings.
# Step 1: keep only users that have at least 100 explicit ratings.
counts1 = ratings_explicit[ 'userID' ] . value_counts( )
ratings_explicit = ratings_explicit[ ratings_explicit[ 'userID' ] . isin( counts1[ counts1 >= 100 ] . index) ]
# Step 2: keep only rows whose rating VALUE occurs at least 100 times.
# NOTE(review): this counts 'bookRating' values, not ISBNs; if the intent was
# "books with at least 100 ratings" the column should be 'ISBN' - confirm.
counts = ratings_explicit[ 'bookRating' ] . value_counts( )
ratings_explicit = ratings_explicit[ ratings_explicit[ 'bookRating' ] . isin( counts[ counts >= 100 ] . index) ]
# Step 3: one row per userID, one column per ISBN; pairs without a rating are NaN.
ratings_matrix = ratings_explicit. pivot( index= 'userID' , columns= 'ISBN' , values= 'bookRating' )
# Row and column labels of the matrix.
userID = ratings_matrix. index
ISBN = ratings_matrix. columns
print ( ratings_matrix. shape)
ratings_matrix. head( )
(449, 66574)
ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 000104799X 0001048082 0001053736 0001053744 0001055607 ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A userID 2033 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2110 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2276 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4017 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4385 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 rows × 66574 columns
# Rebind the counters to the matrix dimensions: rows are users, columns are ISBNs.
n_users, n_books = ratings_matrix.shape
print(n_users, n_books)
449 66574
# Missing user/book pairs become 0 and the whole matrix is stored as int32.
ratings_matrix = ratings_matrix.fillna(0).astype(np.int32)
ratings_matrix.head(5)
ISBN 0000913154 0001046438 000104687X 0001047213 0001047973 000104799X 0001048082 0001053736 0001053744 0001055607 ... B000092Q0A B00009EF82 B00009NDAN B0000DYXID B0000T6KHI B0000VZEJQ B0000X8HIE B00013AX9E B0001I1KOG B000234N3A userID 2033 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 2110 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 2276 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 4017 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 4385 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 66574 columns
# Sparsity after filtering: 1 - (kept explicit ratings) / (user count * book count).
# NOTE(review): the user count here is users_exp_ratings.shape[0], which was
# computed before ratings_explicit was reduced to heavy raters, while
# len(ratings_explicit) and n_books reflect the reduced data. n_users * n_books
# (the ratings_matrix dimensions) may be the intended denominator - confirm.
sparsity= 1.0 - len ( ratings_explicit) / float ( users_exp_ratings. shape[ 0 ] * n_books)
# The message reports the sparsity level as a percentage.
print ( '图书交叉数据集的稀疏级别是 ' + str ( sparsity* 100 ) + ' %' )
图书交叉数据集的稀疏级别是 99.99772184106935 %
# Module-level defaults: number of neighbours and the distance metric name.
# (A `global` declaration is unnecessary at module scope.)
k = 10
metric = 'cosine'
def findksimilarusers(user_id, ratings, metric=metric, k=k):
    """Find the rows of ``ratings`` closest to the row labelled ``user_id``.

    A brute-force ``NearestNeighbors`` model is fitted on every row of
    ``ratings`` and queried with the target row for ``k + 1`` neighbours.

    Args:
        user_id: Index label of the target row in ``ratings``.
        ratings: DataFrame whose rows are compared with each other.
        metric: Distance metric name passed to ``NearestNeighbors``.
        k: Number of neighbours wanted; ``k + 1`` are requested.

    Returns:
        ``(similarities, indices)``: ``similarities`` is the flattened
        ``1 - distance`` array and ``indices`` is the array of positional
        row indices exactly as returned by ``kneighbors``.
    """
    knn = NearestNeighbors(metric=metric, algorithm='brute')
    knn.fit(ratings)

    row_pos = ratings.index.get_loc(user_id)
    query = ratings.iloc[row_pos, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query, n_neighbors=k + 1)

    # Turn distances into similarities.
    return 1 - distances.flatten(), indices
def predict_userbased(user_id, item_id, ratings, metric=metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` from similar users.

    The prediction is the target user's mean rating plus the
    similarity-weighted average of each neighbour's deviation from that
    neighbour's own mean rating for the item. The result is rounded to an
    integer, limited to the range 1..10, printed and returned.

    Args:
        user_id: Index label of the target user in ``ratings``.
        item_id: Column label of the item in ``ratings``.
        ratings: User x item DataFrame of ratings.
        metric: Distance metric forwarded to ``findksimilarusers``.
        k: Number of neighbours forwarded to ``findksimilarusers``.

    Returns:
        The predicted rating as an ``int`` between 1 and 10.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)
    mean_rating = ratings.iloc[user_loc, :].mean()

    wtd_sum = 0.0
    sum_wt = 0.0
    for similarity, neighbor_loc in zip(similarities, indices.flatten()):
        # The target user is not its own neighbour.
        if neighbor_loc == user_loc:
            continue
        neighbor_row = ratings.iloc[neighbor_loc, :]
        ratings_diff = neighbor_row.iloc[item_loc] - neighbor_row.mean()
        wtd_sum += ratings_diff * similarity
        # Total the weights of exactly the neighbours that were used, so the
        # denominator stays right even if the target row is not among the
        # returned neighbours.
        sum_wt += similarity

    # Without any usable weight, fall back to the user's own mean rating
    # rather than dividing by zero.
    adjustment = wtd_sum / sum_wt if sum_wt != 0 else 0.0
    prediction = int(round(mean_rating + adjustment))

    # Limit the result to the rating scale. This must happen after the
    # prediction has been computed to have any effect.
    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print('用户预测等级 {0} -> item {1}: {2}'.format(user_id, item_id, prediction))
    return prediction
predict_userbased( 11676 , '0001056107' , ratings_matrix)
用户预测等级 11676 -> item 0001056107: 2
2
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find the columns of ``ratings`` closest to the column ``item_id``.

    ``ratings`` is transposed so that each item becomes a row, a brute-force
    ``NearestNeighbors`` model is fitted on those rows, and the target item's
    row is queried for ``k + 1`` neighbours.

    Args:
        item_id: Column label of the target item in ``ratings``.
        ratings: DataFrame whose columns are compared with each other.
        metric: Distance metric name passed to ``NearestNeighbors``.
        k: Number of neighbours wanted; ``k + 1`` are requested.

    Returns:
        ``(similarities, indices)``: ``similarities`` is the flattened
        ``1 - distance`` array and ``indices`` is the array of positional
        indices (into the transposed frame) as returned by ``kneighbors``.
    """
    by_item = ratings.T
    row_pos = by_item.index.get_loc(item_id)

    knn = NearestNeighbors(metric=metric, algorithm='brute')
    knn.fit(by_item)

    query = by_item.iloc[row_pos, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query, n_neighbors=k + 1)

    # Turn distances into similarities.
    return 1 - distances.flatten(), indices
def predict_itembased(user_id, item_id, ratings, metric=metric, k=k):
    """Predict the rating of ``item_id`` by ``user_id`` from similar items.

    The prediction is the similarity-weighted average of the ratings the
    user gave to the neighbouring items. The result is rounded to an
    integer, limited to the range 1..10, printed and returned.

    Args:
        user_id: Index label of the target user in ``ratings``.
        item_id: Column label of the target item in ``ratings``.
        ratings: User x item DataFrame of ratings.
        metric: Distance metric forwarded to ``findksimilaritems``.
        k: Number of neighbours forwarded to ``findksimilaritems``.

    Returns:
        The predicted rating as an ``int`` between 1 and 10.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    # Forward metric and k so the caller's choices are honoured instead of
    # silently falling back to the helper's defaults.
    similarities, indices = findksimilaritems(item_id, ratings, metric, k)

    wtd_sum = 0.0
    sum_wt = 0.0
    for similarity, neighbor_loc in zip(similarities, indices.flatten()):
        # The target item is not its own neighbour.
        if neighbor_loc == item_loc:
            continue
        wtd_sum += ratings.iloc[user_loc, neighbor_loc] * similarity
        # Total the weights of exactly the neighbours that were used, so the
        # denominator stays right even if the target item is not among the
        # returned neighbours.
        sum_wt += similarity

    # With no usable weight there is nothing to average; 0 is then raised to
    # the minimum rating by the limit below.
    prediction = int(round(wtd_sum / sum_wt)) if sum_wt != 0 else 0

    # Limit the result to the rating scale.
    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print('用户预测等级 {0} -> item {1}: {2}'.format(user_id, item_id, prediction))
    return prediction
prediction = predict_itembased( 11676 , '0001056107' , ratings_matrix)
用户预测等级 11676 -> item 0001056107: 1