Spaces:

somewheresystems
/

dataclysm

Paused

App Files Community

somewheresy committed on Jan 24, 2024

Commit

a7193d8

verified ·

1 Parent(s): 4fa8c7b

Upload app.py

Browse files

Files changed (1) hide show

app.py +48 -225

app.py CHANGED Viewed

@@ -8,56 +8,14 @@ from sklearn.cluster import KMeans
 import plotly.graph_objects as go
 import time
 import logging
-from sklearn.cluster import HDBSCAN
-BACKGROUND_COLOR = 'black'
-COLOR = 'white'
-def set_page_container_style(
-        max_width: int = 10000, max_width_100_percent: bool = False,
-        padding_top: int = 1, padding_right: int = 10, padding_left: int = 1, padding_bottom: int = 10,
-        color: str = COLOR, background_color: str = BACKGROUND_COLOR,
-    ):
-        if max_width_100_percent:
-            max_width_str = f'max-width: 100%;'
-        else:
-            max_width_str = f'max-width: {max_width}px;'
-        st.markdown(
-            f'''
-            <style>
-                .reportview-container .css-1lcbmhc .css-1outpf7 {{
-                    padding-top: 35px;
-                }}
-                .reportview-container .main .block-container {{
-                    {max_width_str}
-                    padding-top: {padding_top}rem;
-                    padding-right: {padding_right}rem;
-                    padding-left: {padding_left}rem;
-                    padding-bottom: {padding_bottom}rem;
-                }}
-                .reportview-container .main {{
-                    color: {color};
-                    background-color: {background_color};
-                }}
-            </style>
-            ''',
-            unsafe_allow_html=True,
-        )
 # Additional libraries for querying
 from FlagEmbedding import FlagModel
 # Global variables and dataset loading
 global dataset_name
-st.set_page_config(layout="wide")
-dataset_name = "somewheresystems/dataclysm-arxiv"
-set_page_container_style(
-        max_width = 1600, max_width_100_percent = True,
-        padding_top = 0, padding_right = 10, padding_left = 5, padding_bottom = 10
-)
 st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
 total_samples = len(st.session_state.dataclysm_arxiv)
@@ -125,69 +83,20 @@ def perform_tsne(embeddings):
 def perform_clustering(df, tsne_results):
     start_time = time.time()
-    # Perform DBSCAN clustering
-    logging.info('Performing HDBSCAN clustering...')
     # Step 3: Visualization with Plotly
-    # Normalize the t-SNE results between 0 and 1
-    df['tsne-3d-one'] = (tsne_results[:,0] - tsne_results[:,0].min()) / (tsne_results[:,0].max() - tsne_results[:,0].min())
-    df['tsne-3d-two'] = (tsne_results[:,1] - tsne_results[:,1].min()) / (tsne_results[:,1].max() - tsne_results[:,1].min())
-    df['tsne-3d-three'] = (tsne_results[:,2] - tsne_results[:,2].min()) / (tsne_results[:,2].max() - tsne_results[:,2].min())
-    # Perform DBSCAN clustering
-    hdbscan = HDBSCAN(min_cluster_size=10, min_samples=50)
-    cluster_labels = hdbscan.fit_predict(df[['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']])
-    df['cluster'] = cluster_labels
     end_time = time.time()  # End timing
-    st.sidebar.text(f'HDBSCAN clustering completed in {end_time - start_time:.3f} seconds')
     return df
-def update_camera_position(fig, df, df_query, result_id, K=10):
-    # Focus the camera on the closest result
-    top_K_ids = df_query.sort_values(by='proximity', ascending=True).head(K)['id'].tolist()
-    top_K_proximity = df_query['proximity'].tolist()
-    top_results = df[df['id'].isin(top_K_ids)]
-    camera_focus = dict(
-        eye=dict(x=top_results.iloc[0]['tsne-3d-one']*0.1, y=top_results.iloc[0]['tsne-3d-two']*0.1, z=top_results.iloc[0]['tsne-3d-three']*0.1)
-    )
-    # Normalize the proximity values to range between 1 and 10
-    normalized_proximity = [10 - (10 * (prox - min(top_K_proximity)) / (max(top_K_proximity) - min(top_K_proximity))) for prox in top_K_proximity]
-    # Create a dictionary mapping id to normalized proximity
-    id_to_proximity = dict(zip(top_K_ids, normalized_proximity))
-    # Set marker sizes based on proximity for top K ids, all other points stay the same
-    marker_sizes = [id_to_proximity[id] if id in top_K_ids else 1 for id in df['id']]
-    # Store the original colors in a separate column
-    df['color'] = df['cluster']
-    fig = go.Figure(data=[go.Scatter3d(
-        x=df['tsne-3d-one'],
-        y=df['tsne-3d-two'],
-        z=df['tsne-3d-three'],
-        mode='markers',
-        marker=dict(size=marker_sizes, color=df['color'], colorscale='Viridis', opacity=0.8, line_width=0),
-        hovertext=df['hovertext'],
-        hoverinfo='text',
-    )])
-    # Set grid opacity to 10%
-    fig.update_layout(scene = dict(xaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
-                                    yaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
-                                    zaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)')))
-    # Add lines stemming from the top result to all other points in the top K
-    for i in range(0, K):  # there are K-1 lines from the top result to the other K-1 points
-        fig.add_trace(go.Scatter3d(
-            x=[top_results.iloc[0]['tsne-3d-one'], top_results.iloc[i]['tsne-3d-one']],
-            y=[top_results.iloc[0]['tsne-3d-two'], top_results.iloc[i]['tsne-3d-two']],
-            z=[top_results.iloc[0]['tsne-3d-three'], top_results.iloc[i]['tsne-3d-three']],
-            mode='lines',
-            line=dict(color='white',width=0.4),  # Set line opacity to 50%
-            showlegend=False,
-            hoverinfo='none',
-        ))
-    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)',
-                paper_bgcolor='rgba(0,0,0,0)',
-                scene_camera=camera_focus)
-    return fig
 def main():
     # Custom CSS
     custom_css = """
@@ -203,126 +112,47 @@ def main():
             color: #F8F8F8; /* Set the font color to F8F8F8 */
         }
         /* Add your CSS styles here */
-        .stPlotlyChart {
-            width: 100%;
-            height: 100%;
-        /* Other styles... */
-        }
         h1 {
             text-align: center;
         }
         h2,h3,h4 {
             text-align: justify;
-            font-size: 8px;
-        }
-        st-emotion-cache-1wmy9hl {
-            font-size: 8px;
         }
         body {
-            color: #fff;
-            background-color: #202020;
         }
         .stSlider .css-1cpxqw2 {
             background: #202020;
-            color: #fd5137;
-        }
-        .stSlider .text {
-            background: #202020;
-            color: #fd5137;
         }
         .stButton > button {
             background-color: #202020;
-            width: 60%;
-            margin-left: auto;
-            margin-right: auto;
-            display: block;
             padding: 10px 24px;
             font-size: 16px;
             font-weight: bold;
-            border: 1px solid #f8f8f8;
-        }
-        .stButton > button:hover {
-            color: #Fd5137
-            border: 1px solid #fd5137;
-        }
-        .stButton > button:active {
-            color: #F8F8F8;
-            border: 1px solid #fd5137;
-            background-color: #fd5137;
         }
         .reportview-container .main .block-container {
-            padding: 0;
             background-color: #202020;
-            width: 100%; /* Make the plotly graph take up full width */
-        }
-        .sidebar .sidebar-content {
-            background-image: linear-gradient(#202020,#202020);
-            color: white;
-            size: 0.2em; /* Make the text in the sidebar smaller */
-            padding: 0;
-        }
-        .reportview-container .main .block-container {
-            background-color: #000000;
-        }
-        .stText {
-            padding: 0;
-        }
-        /* Set the main background color to #202020 */
-        .appview-container {
-            background-color: #000000;
-            padding: 0;
-        }
-        .stVerticalBlockBorderWrapper{
-            padding: 0;
-            margin-left: 0px;
-        }
-        .st-emotion-cache-1cypcdb {
-            background-color: #202020;
-            background-image: none;
-            color: #000000;
-            padding: 0;
-        }
-        .stPlotlyChart {
-            background-color: #000000;
-            background-image: none;
-            color: #000000;
-            padding: 0;
-        }
-        .reportview-container .css-1lcbmhc .css-1outpf7 {
-            padding-top: 35px;
-        }
-        .reportview-container .main .block-container {
-            max-width: 100%;
-            padding-top: 0rem;
-            padding-right: 0rem;
-            padding-left: 0rem;
-            padding-bottom: 10rem;
-        }
-        .reportview-container .main {
-            color: white;
-            background-color: black;
-        }
-        .stHeader {
-            color: black;
-            background-color: black;
         }
     </style>
     """
     # Inject custom CSS with markdown
     st.markdown(custom_css, unsafe_allow_html=True)
-    st.sidebar.title('Spatial Search Engine')
     st.sidebar.markdown(
-        '<a href="http://dataclysm.xyz" target="_blank" style="display: flex; justify-content: center; padding: 10px;">dataclysm.xyz <img src="https://www.somewhere.systems/S2-white-logo.png" style="width: 8px; height: 8px;"></a>',
         unsafe_allow_html=True
     )
     # Check if data needs to be loaded
     if 'data_loaded' not in st.session_state or not st.session_state.data_loaded:
         # User input for number of samples
-        num_samples = st.sidebar.slider('Select number of samples', 1000, int(round(total_samples/10)), 1000)
         if st.sidebar.button('Initialize'):
             st.sidebar.text('Initializing data pipeline...')
@@ -341,6 +171,8 @@ def main():
                 print(f"FAISS index for {column_name} added.")
                 return dataset
             # Load data and perform t-SNE and clustering
             df, embeddings = load_data(num_samples)
@@ -377,21 +209,21 @@ def main():
                 marker=dict(
                     size=1,
                     color=df['cluster'],
-                    colorscale='Jet',
-                    opacity=0.75
                 )
             )])
-            # Set grid opacity to 10%
-            fig.update_layout(scene = dict(xaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
-                                           yaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
-                                           zaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)')))
             fig.update_layout(
-                plot_bgcolor='rgba(0,0,0,0)',
-                paper_bgcolor='rgba(0,0,0,0)',
                 height=800,
                 margin=dict(l=0, r=0, b=0, t=0),
-                scene_camera=dict(eye=dict(x=0.1, y=0.1, z=0.1))
             )
             st.session_state.fig = fig
@@ -404,19 +236,8 @@ def main():
     if 'df' in st.session_state:
         # Sidebar for querying
         with st.sidebar:
-            st.sidebar.markdown("# Detailed View")
-            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)
-            # Display metadata for the selected article
-            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
-            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
-            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
-            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
-            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
-            st.sidebar.markdown("### Find Similar in Latent Space")
-            query = st.text_input("", value=selected_row['title'])
-            top_k = st.slider("top k", 1, 100, 10)
             if st.button("Search"):
                 # Define the model
                 print("Initializing model...")
@@ -427,7 +248,7 @@ def main():
                 query_embedding = model.encode([query])
                 # Retrieve examples by title similarity (or abstract, depending on your preference)
-                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=top_k)
                 df_query = pd.DataFrame(retrieved_examples_title)
                 df_query['proximity'] = scores_title
                 df_query = df_query.sort_values(by='proximity', ascending=True)
@@ -436,17 +257,19 @@ def main():
                 # Fix the <a href link> to display properly
                 df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
                 st.sidebar.markdown(df_query[['title', 'proximity', 'id']].to_html(escape=False), unsafe_allow_html=True)
-                # Get the ID of the top search result
-                top_result_id = df_query.iloc[0]['id']
-                # Update the camera position and appearance of points
-                updated_fig = update_camera_position(st.session_state.fig, st.session_state.df, df_query, top_result_id,top_k)
-                # Update the figure in the session state and redraw the plot in the sidebar
-                st.session_state.fig = updated_fig
-    # Display the plot if data is loaded
-    if 'data_loaded' in st.session_state and st.session_state.data_loaded:
-        st.plotly_chart(st.session_state.fig, use_container_width=True)
 if __name__ == "__main__":
-    main()

 import plotly.graph_objects as go
 import time
 import logging
 # Additional libraries for querying
 from FlagEmbedding import FlagModel
 # Global variables and dataset loading
 global dataset_name
+dataset_name = 'somewheresystems/dataclysm-arxiv'
 st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
 total_samples = len(st.session_state.dataclysm_arxiv)
 def perform_clustering(df, tsne_results):
     start_time = time.time()
+    # Perform KMeans clustering
+    logging.info('Performing k-means clustering...')
     # Step 3: Visualization with Plotly
+    df['tsne-3d-one'] = tsne_results[:,0]
+    df['tsne-3d-two'] = tsne_results[:,1]
+    df['tsne-3d-three'] = tsne_results[:,2]
+    # Perform KMeans clustering
+    kmeans = KMeans(n_clusters=16)  # Change the number of clusters as needed
+    df['cluster'] = kmeans.fit_predict(df[['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']])
     end_time = time.time()  # End timing
+    st.sidebar.text(f'k-means clustering completed in {end_time - start_time:.3f} seconds')
     return df
 def main():
     # Custom CSS
     custom_css = """
             color: #F8F8F8; /* Set the font color to F8F8F8 */
         }
         /* Add your CSS styles here */
         h1 {
             text-align: center;
         }
         h2,h3,h4 {
             text-align: justify;
+            font-size: 8px
         }
         body {
+            text-align: justify;
         }
         .stSlider .css-1cpxqw2 {
             background: #202020;
         }
         .stButton > button {
             background-color: #202020;
+            width: 100%;
+            border: none;
             padding: 10px 24px;
+            border-radius: 5px;
             font-size: 16px;
             font-weight: bold;
         }
         .reportview-container .main .block-container {
+            padding: 2rem;
             background-color: #202020;
         }
     </style>
     """
     # Inject custom CSS with markdown
     st.markdown(custom_css, unsafe_allow_html=True)
     st.sidebar.markdown(
+        f'<img src="https://www.somewhere.systems/S2-white-logo.png" style="float: bottom-left; width: 32px; height: 32px; opacity: 1.0; animation: fadein 2s;">',
         unsafe_allow_html=True
     )
+    st.sidebar.title('Spatial Search Engine')
     # Check if data needs to be loaded
     if 'data_loaded' not in st.session_state or not st.session_state.data_loaded:
         # User input for number of samples
+        num_samples = st.sidebar.slider('Select number of samples', 1000, total_samples, 1000)
         if st.sidebar.button('Initialize'):
             st.sidebar.text('Initializing data pipeline...')
                 print(f"FAISS index for {column_name} added.")
                 return dataset
             # Load data and perform t-SNE and clustering
             df, embeddings = load_data(num_samples)
                 marker=dict(
                     size=1,
                     color=df['cluster'],
+                    colorscale='Viridis',
+                    opacity=0.8
                 )
             )])
             fig.update_layout(
+                plot_bgcolor='#202020',
                 height=800,
                 margin=dict(l=0, r=0, b=0, t=0),
+                scene=dict(
+                    xaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                    yaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                    zaxis=dict(showbackground=True, backgroundcolor="#000000"),
+                ),
+                scene_camera=dict(eye=dict(x=0.001, y=0.001, z=0.001))
             )
             st.session_state.fig = fig
     if 'df' in st.session_state:
         # Sidebar for querying
         with st.sidebar:
+            st.sidebar.markdown("### Query Embeddings")
+            query = st.text_input("Enter your query:")
             if st.button("Search"):
                 # Define the model
                 print("Initializing model...")
                 query_embedding = model.encode([query])
                 # Retrieve examples by title similarity (or abstract, depending on your preference)
+                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
                 df_query = pd.DataFrame(retrieved_examples_title)
                 df_query['proximity'] = scores_title
                 df_query = df_query.sort_values(by='proximity', ascending=True)
                 # Fix the <a href link> to display properly
                 df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
                 st.sidebar.markdown(df_query[['title', 'proximity', 'id']].to_html(escape=False), unsafe_allow_html=True)
+            st.sidebar.markdown("# Detailed View")
+            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)
+            # Display metadata for the selected article
+            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
+            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
+            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
+            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
+            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
 if __name__ == "__main__":
+    main()