Spaces:

Hack90
/

virus_explorer

Sleeping

App Files Files Community

Hack90 committed on Sep 5, 2024

Commit

d1f4671

verified ·

1 Parent(s): d9d2b55

Update utils.py

Browse files

Files changed (1) hide show

utils.py +114 -0

utils.py CHANGED Viewed

@@ -300,6 +300,120 @@ def wens_method_heatmap(df, virus_species):
     return fig
 ############################################################# ColorSquare ########################################################

     return fig
+############################################################# Sub-Specie ########################################################
+import numpy as np
+from scipy.interpolate import interp1d, CubicSpline
+import pandas as pd
+from tqdm import tqdm
# Module-level constants for the sub-species analysis.
# NOTE(review): MIN_DISTANCE is not referenced by the functions visible in
# this section -- confirm whether it is still used elsewhere.
MIN_DISTANCE = 2581

# 2D unit step taken for each nucleotide when a sequence is drawn as a walk.
# Every step has a positive x-component (0.5 or 0.8660254), so the cumulative
# walk always advances to the right and its x-coordinate is strictly
# increasing.
VECTORS = {
    "A": [0.5, -0.8660254],  # -60 degrees
    "T": [0.5, 0.8660254],   # +60 degrees
    "G": [0.8660254, -0.5],  # -30 degrees
    "C": [0.8660254, 0.5],   # +30 degrees
}
def create_dna_representation_ew_subs(seq, n_points=2048):
    """Create a fixed-size 2D curve for a DNA sequence.

    Each recognised base (a key of ``VECTORS``) is mapped to its 2D step
    vector and the steps are accumulated into a walk. The x and y coordinates
    of that walk are each fitted with a cubic spline over the base index and
    resampled at ``n_points`` evenly spaced positions, so sequences of
    different lengths yield curves of the same size.

    Args:
        seq: Nucleotide sequence. Characters that are not keys of ``VECTORS``
            are skipped.
        n_points: Number of samples in the returned curve. Defaults to 2048.

    Returns:
        A list of ``n_points`` ``[x, y]`` pairs.

    Raises:
        ValueError: If fewer than two recognised bases remain after cleaning;
            a spline cannot be fitted through fewer than two points.
    """
    # Drop unrecognised characters and map the rest to step vectors.
    steps = [VECTORS[char] for char in seq if char in VECTORS]
    if len(steps) < 2:
        raise ValueError(
            "sequence must contain at least 2 recognised bases "
            f"({''.join(VECTORS)}); got {len(steps)}"
        )

    # Cumulative sum turns the per-base steps into walk coordinates.
    walk = np.asarray(steps, dtype=float).cumsum(axis=0)

    # Fit x(t) and y(t) separately, with t being the base index.
    base_index = np.arange(len(walk))
    spline_x = CubicSpline(base_index, walk[:, 0])
    spline_y = CubicSpline(base_index, walk[:, 1])

    # Resample the whole walk at a fixed number of evenly spaced positions.
    sample_at = np.linspace(0, len(walk) - 1, n_points)
    return np.column_stack([spline_x(sample_at), spline_y(sample_at)]).tolist()
def create_dna_representation_for_subs(row):
    """Create a 1D profile of a DNA sequence for pairwise comparison.

    The first ``row['min_distance']`` recognised bases of ``row['seq']`` are
    turned into a 2D walk (cumulative sum of their ``VECTORS`` steps). The
    walk's y-coordinate is then expressed as a function of its x-coordinate
    with a cubic interpolant and sampled at x = 0, 1, ..., n_samples - 1,
    where ``n_samples = int(min_distance * 0.66)``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing ``'seq'``
            (the nucleotide string) and ``'min_distance'`` (number of bases
            to use; converted with ``int``).

    Returns:
        A 1D NumPy array of ``n_samples`` interpolated y-values.

    Raises:
        ValueError: If fewer than four recognised bases are available, which
            is too few for a cubic interpolant.
    """
    n_bases = int(row['min_distance'])
    seq = ''.join(char for char in row['seq'] if char in VECTORS)[:n_bases]
    if len(seq) < 4:
        raise ValueError(
            "cubic interpolation needs at least 4 recognised bases; "
            f"got {len(seq)}"
        )

    # Each base advances x by 0.5 or 0.8660254, so n_bases bases span between
    # 0.5 * n_bases and 0.8660254 * n_bases along x. Sampling 0.66 * n_bases
    # integer positions can therefore run past the end of the walk; those
    # points are extrapolated.
    # NOTE(review): 0.66 looks like a heuristic for the average x-advance per
    # base -- confirm the intended value.
    n_samples = int(n_bases * 0.66)

    # Convert the sequence to step vectors and accumulate them into a walk.
    walk = np.array([VECTORS[char] for char in seq], dtype=float).cumsum(axis=0)

    # Cubic (not linear) interpolation of y over x. x is strictly increasing
    # because every step vector has a positive x-component.
    y_of_x = interp1d(walk[:, 0], walk[:, 1], kind='cubic', fill_value='extrapolate')
    sample_at = np.linspace(0, n_samples - 1, n_samples)
    return y_of_x(sample_at)
def create_groups_subs(closest_matches):
    """Group indices that are linked through their closest matches.

    ``closest_matches[i]`` lists the indices considered close to element
    ``i``. Starting from each not-yet-visited index in ascending order, every
    index reachable by following those links is collected into one group.
    Groups with a single member are discarded.

    The traversal uses an explicit stack instead of recursion, so a large
    group cannot exceed Python's recursion limit.

    Args:
        closest_matches: Sequence where item ``i`` is an iterable of
            neighbour indices for element ``i``. Links are followed in the
            direction given; symmetric input gives connected components.

    Returns:
        A dict mapping ``"group_1"``, ``"group_2"``, ... to the sorted list
        of member indices of each group with more than one member.
    """
    groups = {}
    visited = set()

    for start in range(len(closest_matches)):
        if start in visited:
            continue

        # Iterative depth-first traversal from `start`.
        members = set()
        pending = [start]
        while pending:
            node = pending.pop()
            if node in visited:
                continue
            visited.add(node)
            members.add(node)
            # int() normalises NumPy integer indices to plain Python ints.
            pending.extend(
                int(neighbor)
                for neighbor in closest_matches[node]
                if neighbor not in visited
            )

        if len(members) > 1:  # Ignore elements with no closest match
            groups[f"group_{len(groups) + 1}"] = sorted(members)

    return groups
def process_data_sub_specie(df, species):
    """Assign sequences of one species to similarity groups.

    Steps:
      1. Keep the rows of ``df`` whose ``'organism_name'`` equals ``species``.
      2. Drop sequences whose ``'seq_len'`` is not greater than 80% of the
         median length; that cutoff is also stored in ``'min_distance'``.
      3. Build a 1D profile per sequence with
         ``create_dna_representation_for_subs`` and compute the pairwise L1
         distance between all profiles.
      4. Link sequences whose distance is below three times the cutoff and
         group linked sequences with ``create_groups_subs``.
      5. Store the group label in ``'group'`` (``'No Group'`` when ungrouped)
         and the 2D curve from ``create_dna_representation_ew_subs`` in
         ``'two_d'``.

    Args:
        df: DataFrame with at least the columns ``'organism_name'``,
            ``'seq_len'`` and ``'seq'``.
        species: Value of ``'organism_name'`` to select.

    Returns:
        A new DataFrame (index reset) containing the retained rows plus the
        ``'min_distance'``, ``'two_d'`` and ``'group'`` columns. It is empty
        when no row matches ``species``.
    """
    # Filter data for the given species.
    df_plot = df[df['organism_name'] == species].reset_index(drop=True).copy()

    # Keep only sequences longer than 80% of the median length.
    length_cutoff = df_plot['seq_len'].median() * 0.8
    df_plot['min_distance'] = length_cutoff
    df_plot = df_plot[df_plot['seq_len'] > length_cutoff].reset_index(drop=True)

    if df_plot.empty:
        # Nothing to compare: return an empty frame with the same columns
        # instead of failing on an empty distance matrix.
        df_plot['two_d'] = df_plot['seq'].apply(create_dna_representation_ew_subs)
        df_plot['group'] = 'No Group'
        return df_plot

    # 'two_d' temporarily holds the 1D profiles; it is overwritten with the
    # 2D curves at the end.
    df_plot['two_d'] = df_plot.apply(create_dna_representation_for_subs, axis=1)
    profiles = np.array(df_plot['two_d'].tolist())

    # Pairwise L1 distance, one row of the matrix at a time.
    distances = np.array([
        np.abs(profiles - profiles[i]).sum(axis=1)
        for i in tqdm(range(profiles.shape[0]))
    ])

    # A sequence must never count as its own match. np.inf stays above the
    # threshold for any genome length, which a fixed sentinel would not.
    np.fill_diagonal(distances, np.inf)
    match_threshold = length_cutoff * 3
    close_matches = [np.where(row < match_threshold)[0] for row in distances]

    # Create groups and record them on the dataframe.
    df_plot['group'] = 'No Group'
    for group_name, group_indices in create_groups_subs(close_matches).items():
        df_plot.loc[group_indices, 'group'] = group_name

    # Create 2D representations.
    df_plot['two_d'] = df_plot['seq'].apply(create_dna_representation_ew_subs)
    return df_plot
 ############################################################# ColorSquare ########################################################