Jan Mühlnikel committed on
Commit
f17e764
1 Parent(s): ac6359f

sparse matrix changes

Browse files
functions/calc_matches.py CHANGED
@@ -1,6 +1,8 @@
1
  import pandas as pd
2
  import numpy as np
 
3
 
 
4
  def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
5
  # matching project2 can be nay project
6
  # indecies (rows) = project1
@@ -31,6 +33,45 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
31
  p2_df["similarity"] = top_values
32
 
33
  return p1_df, p2_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
 
 
1
  import pandas as pd
2
  import numpy as np
3
+ from scipy.sparse import csr_matrix, lil_matrix
4
 
5
+ """
6
  def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
7
  # matching project2 can be nay project
8
  # indecies (rows) = project1
 
33
  p2_df["similarity"] = top_values
34
 
35
  return p1_df, p2_df
36
+ """
37
+
38
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (project1, project2) pairs.

    Rows of the candidate matrix come from ``filtered_df`` (project1) and
    columns from ``project_df`` (project2). The index labels of both frames
    are used directly as row/column positions in ``similarity_matrix``
    (assumes the frames are indexed by matrix position -- TODO confirm
    against the caller).

    Only entries that are explicitly stored in the sparse matrix are
    candidates, so fewer than ``top_x`` pairs are returned when the selected
    submatrix holds fewer stored values. When both frames carry the same
    index, pairs of a project with itself are excluded.

    Args:
        filtered_df: DataFrame of candidate "project 1" rows.
        project_df: DataFrame of candidate "project 2" rows.
        similarity_matrix: Square similarity matrix; anything accepted by
            ``csr_matrix`` (it is converted when it is not already CSR).
        top_x: Maximum number of pairs to return. Values <= 0 yield empty
            results.

    Returns:
        Tuple ``(p1_df, p2_df)`` of equally long DataFrames. Row ``k`` of
        ``p1_df`` and row ``k`` of ``p2_df`` form one match; both carry the
        score in a ``similarity`` column. Rows are ordered by ascending
        similarity (best match last), as in the previous implementation.
    """
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Explicit integer dtype so that empty frames still index cleanly.
    row_labels = np.asarray(filtered_df.index.to_list(), dtype=np.intp)
    col_labels = np.asarray(project_df.index.to_list(), dtype=np.intp)

    # COO exposes (row, col, value) triplets, so a position in ``data`` can be
    # mapped back to its true coordinates. Positions in ``data`` are NOT flat
    # indices of the dense matrix and must not go through ``unravel_index``.
    match_matrix = similarity_matrix[row_labels, :][:, col_labels].tocoo()
    rows, cols, values = match_matrix.row, match_matrix.col, match_matrix.data

    # Same project set on both sides: drop self-matches (the diagonal of the
    # full matrix) without copying/modifying the full matrix.
    if np.array_equal(row_labels, col_labels):
        off_diagonal = row_labels[rows] != col_labels[cols]
        rows = rows[off_diagonal]
        cols = cols[off_diagonal]
        values = values[off_diagonal]

    # Clamp so that top_x == 0 selects nothing (a plain ``[-0:]`` slice would
    # select everything) and oversized requests return what is available.
    wanted = min(max(int(top_x), 0), len(values))
    order = np.argsort(values, kind="stable")
    best = order[len(order) - wanted:]

    top_values = values[best]

    # Submatrix rows/cols follow the frames' row order, hence positional iloc.
    p1_df = filtered_df.iloc[rows[best]].copy()
    p1_df["similarity"] = top_values
    p2_df = project_df.iloc[cols[best]].copy()
    p2_df["similarity"] = top_values

    return p1_df, p2_df
75
 
76
 
77
 
functions/single_similar.py CHANGED
@@ -1,6 +1,8 @@
1
  import pandas as pd
2
  import numpy as np
 
3
 
 
4
  def find_similar(p_index, similarity_matrix, filtered_df, top_x):
5
 
6
  # filter out just projects from filtered df
@@ -21,5 +23,32 @@ def find_similar(p_index, similarity_matrix, filtered_df, top_x):
21
  result_df["similarity"] = top_10_values_descending
22
 
23
  return result_df
 
 
 
 
 
 
 
 
24
 
 
 
 
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import numpy as np
3
+ from scipy.sparse import csr_matrix
4
 
5
+ """
6
  def find_similar(p_index, similarity_matrix, filtered_df, top_x):
7
 
8
  # filter out just projects from filtered df
 
23
  result_df["similarity"] = top_10_values_descending
24
 
25
  return result_df
26
+ """
27
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """Return the ``top_x`` rows of ``filtered_df`` most similar to one project.

    The index labels of ``filtered_df`` are used directly as column positions
    in ``similarity_matrix`` (assumes the frame is indexed by matrix
    position -- TODO confirm against the caller).

    Args:
        p_index: Row position of the selected project in
            ``similarity_matrix``.
        similarity_matrix: Similarity matrix; anything accepted by
            ``csr_matrix`` (it is converted when it is not already CSR).
        filtered_df: DataFrame of candidate projects.
        top_x: Maximum number of rows to return. Values <= 0 yield an empty
            result.

    Returns:
        A copy of the selected ``filtered_df`` rows, ordered by descending
        similarity, with the score in a ``similarity`` column. The selected
        project itself is not filtered out if it is part of ``filtered_df``.
    """
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Explicit integer dtype so that an empty frame still indexes cleanly.
    filtered_indices = np.asarray(filtered_df.index.tolist(), dtype=np.intp)

    # Take the single project row first, then restrict its columns; this
    # avoids column-slicing the whole matrix just to read one row. The row is
    # densified because argsort needs the implicit zeros as well.
    project_row = (
        similarity_matrix.getrow(p_index)[:, filtered_indices].toarray().ravel()
    )

    # Clamp so that top_x == 0 selects nothing (a plain ``[-0:]`` slice would
    # select everything).
    wanted = min(max(int(top_x), 0), len(project_row))
    ascending = np.argsort(project_row)
    sorted_indices = ascending[len(ascending) - wanted:][::-1]

    # Columns of project_row follow the row order of filtered_df, so select by
    # position; copy so the new column never touches the caller's frame.
    result_df = filtered_df.iloc[sorted_indices].copy()
    result_df["similarity"] = project_row[sorted_indices]

    return result_df
similarity_page.py CHANGED
@@ -28,21 +28,36 @@ def get_process_memory():
28
  """
29
 
30
  # Catch DATA
 
31
  # Load Similarity matrix
 
32
  @st.cache_data
33
  def load_sim_matrix():
34
  loaded_matrix = load_npz("src/extended_similarities.npz")
35
  dense_matrix = loaded_matrix.toarray().astype('float16')
36
 
37
  return dense_matrix
 
 
 
 
 
38
 
 
39
  # Load Non Similar Orga Matrix
 
40
  @st.cache_data
41
  def load_nonsameorga_sim_matrix():
42
  loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
43
  dense_matrix = loaded_matrix.toarray().astype('float16')
44
 
45
  return dense_matrix
 
 
 
 
 
 
46
 
47
  # Load Projects DFs
48
  @st.cache_data
 
28
  """
29
 
30
  # Catch DATA
31
+
32
  # Load Similarity matrix
33
+ """
34
  @st.cache_data
35
  def load_sim_matrix():
36
  loaded_matrix = load_npz("src/extended_similarities.npz")
37
  dense_matrix = loaded_matrix.toarray().astype('float16')
38
 
39
  return dense_matrix
40
+ """
41
@st.cache_data
def load_sim_matrix():
    """Load the extended similarity matrix and return it as loaded.

    The result of ``load_npz`` is returned unchanged; it is no longer
    densified with ``toarray()`` (presumably a SciPy sparse matrix --
    confirm against the ``load_npz`` import).
    """
    return load_npz("src/extended_similarities.npz")
47
  # Load Non Similar Orga Matrix
48
+ """
49
  @st.cache_data
50
  def load_nonsameorga_sim_matrix():
51
  loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
52
  dense_matrix = loaded_matrix.toarray().astype('float16')
53
 
54
  return dense_matrix
55
+ """
56
@st.cache_data
def load_nonsameorga_sim_matrix():
    """Load the similarity matrix for projects of non-same organisations.

    The result of ``load_npz`` is returned unchanged; it is not densified
    with ``toarray()`` (presumably a SciPy sparse matrix -- confirm against
    the ``load_npz`` import).

    Cached with ``st.cache_data`` like ``load_sim_matrix`` so the file is not
    re-read on every call; the decorator had ended up inside the
    commented-out legacy block.
    """
    return load_npz("src/extended_similarities_nonsimorga.npz")
61
 
62
  # Load Projects DFs
63
  @st.cache_data
src/extended_similarities.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4de37759fc87a54415ca39e71b17a9a3b507091f3af401bf70d24bbf1a22aa9
3
- size 6888936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c8747d1e71428e191cab9b6bb7187a7ede099e83f722cd8dabd133b3e994ac4
3
+ size 2779951