cassiebuhler committed on
Commit
6d6db63
·
1 Parent(s): bbe8f2e

filtering by sub_ids because I want to do habitat type queries

Browse files
Files changed (4) hide show
  1. app/app.py +6 -6
  2. app/system_prompt.txt +12 -12
  3. app/utils.py +2 -2
  4. app/variables.py +2 -2
app/app.py CHANGED
@@ -95,7 +95,7 @@ def main():
95
  sql_query = output.sql_query
96
  explanation =output.explanation
97
  if not sql_query: # if the chatbot can't generate a SQL query.
98
- return pd.DataFrame({'id' : []}),'', explanation
99
  result = ca.sql(sql_query).execute()
100
  if result.empty:
101
  explanation = "This query did not return any results. Please try again with a different query."
@@ -139,7 +139,7 @@ def main():
139
  st.stop()
140
 
141
  # output without mapping columns (id, geom)
142
- elif "id" not in llm_output.columns and "geom" not in llm_output.columns:
143
  st.write(llm_output)
144
  not_mapping = True
145
 
@@ -151,8 +151,8 @@ def main():
151
  st.code(sql_query,language = "sql")
152
 
153
  # extract ids, columns, bounds if present
154
- if "id" in llm_output.columns and not llm_output.empty:
155
- ids = list(set(llm_output['id'].tolist()))
156
  llm_cols = extract_columns(sql_query)
157
  bounds = llm_output.total_bounds.tolist()
158
  else:
@@ -390,12 +390,12 @@ def main():
390
  if ('geom' in llm_output.columns) and (not llm_output.empty):
391
  llm_output = llm_output.drop('geom',axis = 1)
392
  if not llm_output.empty:
393
- if 'name' in llm_output.columns and 'id' in llm_output.columns:
394
  llm_grouped = (llm_output.groupby('name')
395
  .agg({col: ('sum' if col == 'acres' else 'first')
396
  for col in llm_output.columns
397
  if col != 'name'})).reset_index()
398
- llm_grouped.drop(['id'], axis=1, inplace = True)
399
  st.dataframe(llm_grouped, use_container_width = True)
400
  else:
401
  st.dataframe(llm_output, use_container_width = True)
 
95
  sql_query = output.sql_query
96
  explanation =output.explanation
97
  if not sql_query: # if the chatbot can't generate a SQL query.
98
+ return pd.DataFrame({'sub_id' : []}),'', explanation
99
  result = ca.sql(sql_query).execute()
100
  if result.empty:
101
  explanation = "This query did not return any results. Please try again with a different query."
 
139
  st.stop()
140
 
141
  # output without mapping columns (sub_id, geom)
142
+ elif "sub_id" not in llm_output.columns and "geom" not in llm_output.columns:
143
  st.write(llm_output)
144
  not_mapping = True
145
 
 
151
  st.code(sql_query,language = "sql")
152
 
153
  # extract ids, columns, bounds if present
154
+ if "sub_id" in llm_output.columns and not llm_output.empty:
155
+ ids = list(set(llm_output['sub_id'].tolist()))
156
  llm_cols = extract_columns(sql_query)
157
  bounds = llm_output.total_bounds.tolist()
158
  else:
 
390
  if ('geom' in llm_output.columns) and (not llm_output.empty):
391
  llm_output = llm_output.drop('geom',axis = 1)
392
  if not llm_output.empty:
393
+ if 'name' in llm_output.columns and 'sub_id' in llm_output.columns:
394
  llm_grouped = (llm_output.groupby('name')
395
  .agg({col: ('sum' if col == 'acres' else 'first')
396
  for col in llm_output.columns
397
  if col != 'name'})).reset_index()
398
+ llm_grouped.drop(['sub_id'], axis=1, inplace = True)
399
  st.dataframe(llm_grouped, use_container_width = True)
400
  else:
401
  st.dataframe(llm_output, use_container_width = True)
app/system_prompt.txt CHANGED
@@ -32,7 +32,7 @@ Ensure the response contains only this JSON object, with no additional text, for
32
  - Unless the user asks about biodiversity overlap or coverage, use columns with the prefix "mean_" to quantify biodiversity.
33
 
34
  # Column Descriptions
35
- - "id": The parent id for sub_id. "id" identifes a protected area, while "sub_id" identifies the particular feature in a protected area. "id" is necessary for displaying queried results on a map.
36
  - "sub_id": unique string identifier for each feature.
37
  - "established": The time range which the land was acquired, either "2024" or "pre-2024".
38
  - "gap_code": The GAP code corresponds to the level of biodiversity protection for an area; GAP 1 has the highest protections whereas GAP 4 has the weakest. There are 4 gap codes and are defined as the following. In the data, non-conservation areas do not have a GAP code.
@@ -73,7 +73,7 @@ Only use the following table:
73
  ## Example:
74
  example_user: "Show me the best areas to go birdwatching in San Diego County."
75
  example_assistant: {{"sql_query":
76
- SELECT "id", "geom", "name", "acres", "county","mean_bird_richness"
77
  FROM mydata
78
  WHERE "county" = 'San Diego'
79
  ORDER BY "mean_bird_richness" DESC
@@ -90,7 +90,7 @@ limit the uses of private land to protect conservation values.
90
  ## Example:
91
  example_user: "Show me all non-profit land."
92
  example_assistant: {{"sql_query":
93
- SELECT "id", "geom", "name", "acres", "manager_type"
94
  FROM mydata
95
  WHERE "manager_type" = 'Non Profit';
96
  "explanation":"I selected all data where `manager_type` is 'Non Profit'."
@@ -99,7 +99,7 @@ example_assistant: {{"sql_query":
99
  ## Example:
100
  example_user: "Where are areas with high plant biodiversity"
101
  example_assistant: {{"sql_query":
102
- SELECT "id", "geom", "name", "acres", "mean_plant_richness"
103
  FROM mydata
104
  ORDER BY "mean_plant_richness" DESC
105
  LIMIT 50;
@@ -107,7 +107,7 @@ example_assistant: {{"sql_query":
107
 
108
  example_user: "Show me areas open to the public in disadvantaged communities."
109
  example_assistant: {{"sql_query":
110
- SELECT "id", "geom", "name", "acres", "access_type", "pct_disadvantaged_community" FROM mydata
111
  WHERE "access_type" = 'Open Access'
112
  AND "pct_disadvantaged_community" > 0;
113
  "explanation": "I used `access_type` to filter for areas that are open to the public (`Open Access`) and `pct_disadvantaged_community` > 0 to include only those located in disadvantaged communities. `pct_disadvantaged_community` identifies communities burdened by multiple sources of pollution with population characteristics that make them more sensitive to pollution."
@@ -141,13 +141,13 @@ sql_query:
141
  ## Example:
142
  example_user: "Show me all land managed by the United States Forest Service."
143
  sql_query:
144
- SELECT "id", "geom", "name", "acres", "manager" FROM mydata
145
  WHERE LOWER("manager") LIKE '%united states forest service%';
146
 
147
  ## Example:
148
  example_user: "Show me areas with more than 25% overlap in bird species richness."
149
  sql_query:
150
- SELECT "id", "geom", "name", "acres", "pct_bird_richness"
151
  FROM mydata
152
  AND (
153
  "pct_bird_richness" > 0.25
@@ -165,7 +165,7 @@ sql_query:
165
 
166
  example_user: "Show me GAP 3 lands where more than 50% of the area overlaps with regions of high biodiversity."
167
  example_assistant: {{"sql_query":
168
- SELECT "id", "geom", "name", "acres", "county",
169
  "pct_top_amphibian_richness", "pct_top_reptile_richness",
170
  "pct_top_bird_richness", "pct_top_mammal_richness",
171
  "pct_top_freshwater_richness", "pct_top_plant_richness"
@@ -186,21 +186,21 @@ example_assistant: {{"sql_query":
186
  SELECT PERCENTILE_CONT(0.85) WITHIN GROUP (ORDER BY "mean_amphibian_richness") AS mean_amphibian_richness_85_percentile
187
  FROM mydata
188
  )
189
- SELECT "id", "geom", "name", "acres","mean_amphibian_richness"
190
  FROM mydata
191
  WHERE "land_tenure" = 'Easement'
192
  AND "mean_amphibian_richness" >= (SELECT mean_amphibian_richness_85_percentile FROM percentile);
193
 
194
  example_user: "Show nonconserved areas in climate zone 2"
195
  example_assistant: {{"sql_query":
196
- SELECT "id", "geom", "name", "acres", "status", "climate_zone"
197
  FROM mydata
198
  WHERE "climate_zone" = 'Zone 2'
199
  AND "status" = 'Non-Conservation Area';
200
 
201
  example_user: "Show me working lands that 30x30 conservation areas"
202
  example_assistant: {{"sql_query":
203
- SELECT "id", "geom", "name", "acres", "land_tenure", "pct_farmland"
204
  FROM mydata
205
  WHERE "status" = '30x30 Conservation Area'
206
  AND "pct_farmland" >0;
@@ -211,7 +211,7 @@ example_assistant: {{"sql_query":
211
  SELECT PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "mean_amphibian_richness") AS mean_mammal_richness_95_percentile
212
  FROM mydata
213
  )
214
- SELECT "id", "geom", "name", "acres","mean_mammal_richness"
215
  FROM mydata
216
  WHERE "status" != '30x30 Conservation Area'
217
  AND "mean_mammal_richness" >= (SELECT mean_amphibian_richness_95_percentile FROM percentile);
 
32
  - Unless the user asks about biodiversity overlap or coverage, use columns with the prefix "mean_" to quantify biodiversity.
33
 
34
  # Column Descriptions
35
+ - "id": The parent id for sub_id. "id" identifies a protected area, while "sub_id" identifies the particular feature in a protected area. "sub_id" is necessary for displaying queried results on a map.
36
  - "sub_id": unique string identifier for each feature.
37
  - "established": The time range which the land was acquired, either "2024" or "pre-2024".
38
  - "gap_code": The GAP code corresponds to the level of biodiversity protection for an area; GAP 1 has the highest protections whereas GAP 4 has the weakest. There are 4 gap codes and are defined as the following. In the data, non-conservation areas do not have a GAP code.
 
73
  ## Example:
74
  example_user: "Show me the best areas to go birdwatching in San Diego County."
75
  example_assistant: {{"sql_query":
76
+ SELECT "sub_id", "geom", "name", "acres", "county","mean_bird_richness"
77
  FROM mydata
78
  WHERE "county" = 'San Diego'
79
  ORDER BY "mean_bird_richness" DESC
 
90
  ## Example:
91
  example_user: "Show me all non-profit land."
92
  example_assistant: {{"sql_query":
93
+ SELECT "sub_id", "geom", "name", "acres", "manager_type"
94
  FROM mydata
95
  WHERE "manager_type" = 'Non Profit';
96
  "explanation":"I selected all data where `manager_type` is 'Non Profit'."
 
99
  ## Example:
100
  example_user: "Where are areas with high plant biodiversity"
101
  example_assistant: {{"sql_query":
102
+ SELECT "sub_id", "geom", "name", "acres", "mean_plant_richness"
103
  FROM mydata
104
  ORDER BY "mean_plant_richness" DESC
105
  LIMIT 50;
 
107
 
108
  example_user: "Show me areas open to the public in disadvantaged communities."
109
  example_assistant: {{"sql_query":
110
+ SELECT "sub_id", "geom", "name", "acres", "access_type", "pct_disadvantaged_community" FROM mydata
111
  WHERE "access_type" = 'Open Access'
112
  AND "pct_disadvantaged_community" > 0;
113
  "explanation": "I used `access_type` to filter for areas that are open to the public (`Open Access`) and `pct_disadvantaged_community` > 0 to include only those located in disadvantaged communities. `pct_disadvantaged_community` identifies communities burdened by multiple sources of pollution with population characteristics that make them more sensitive to pollution."
 
141
  ## Example:
142
  example_user: "Show me all land managed by the United States Forest Service."
143
  sql_query:
144
+ SELECT "sub_id", "geom", "name", "acres", "manager" FROM mydata
145
  WHERE LOWER("manager") LIKE '%united states forest service%';
146
 
147
  ## Example:
148
  example_user: "Show me areas with more than 25% overlap in bird species richness."
149
  sql_query:
150
+ SELECT "sub_id", "geom", "name", "acres", "pct_bird_richness"
151
  FROM mydata
152
  AND (
153
  "pct_bird_richness" > 0.25
 
165
 
166
  example_user: "Show me GAP 3 lands where more than 50% of the area overlaps with regions of high biodiversity."
167
  example_assistant: {{"sql_query":
168
+ SELECT "sub_id", "geom", "name", "acres", "county",
169
  "pct_top_amphibian_richness", "pct_top_reptile_richness",
170
  "pct_top_bird_richness", "pct_top_mammal_richness",
171
  "pct_top_freshwater_richness", "pct_top_plant_richness"
 
186
  SELECT PERCENTILE_CONT(0.85) WITHIN GROUP (ORDER BY "mean_amphibian_richness") AS mean_amphibian_richness_85_percentile
187
  FROM mydata
188
  )
189
+ SELECT "sub_id", "geom", "name", "acres","mean_amphibian_richness"
190
  FROM mydata
191
  WHERE "land_tenure" = 'Easement'
192
  AND "mean_amphibian_richness" >= (SELECT mean_amphibian_richness_85_percentile FROM percentile);
193
 
194
  example_user: "Show nonconserved areas in climate zone 2"
195
  example_assistant: {{"sql_query":
196
+ SELECT "sub_id", "geom", "name", "acres", "status", "climate_zone"
197
  FROM mydata
198
  WHERE "climate_zone" = 'Zone 2'
199
  AND "status" = 'Non-Conservation Area';
200
 
201
  example_user: "Show me working lands that 30x30 conservation areas"
202
  example_assistant: {{"sql_query":
203
+ SELECT "sub_id", "geom", "name", "acres", "land_tenure", "pct_farmland"
204
  FROM mydata
205
  WHERE "status" = '30x30 Conservation Area'
206
  AND "pct_farmland" >0;
 
211
  SELECT PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "mean_amphibian_richness") AS mean_mammal_richness_95_percentile
212
  FROM mydata
213
  )
214
+ SELECT "sub_id", "geom", "name", "acres","mean_mammal_richness"
215
  FROM mydata
216
  WHERE "status" != '30x30 Conservation Area'
217
  AND "mean_mammal_richness" >= (SELECT mean_amphibian_richness_95_percentile FROM percentile);
app/utils.py CHANGED
@@ -184,7 +184,7 @@ def get_summary_table_sql(ca, column, colors, ids, feature_col = None):
184
  """
185
  Generates a summary table using specific IDs as filters.
186
  """
187
- combined_filter = _.id.isin(ids)
188
  df_network = get_summary(ca, combined_filter, [column], column, feature_col, colors)
189
  df_feature = get_summary(ca, combined_filter, [column], column, feature_col, colors, feature = True)
190
  return df_network, df_feature
@@ -206,7 +206,7 @@ def get_pmtiles_style(paint, pmtiles_file, low_res, filter_cols=None, filter_val
206
  Generates a MapLibre GL style for PMTiles with either filters or a list of IDs.
207
  """
208
  if ids:
209
- filter_expr = ["in", ["get", "id"], ["literal", ids]]
210
  else:
211
  # we don't want to overwhelm streamlit so if they didn't filter anything, don't provide filter arg
212
  filter_length = sum([len(x) for x in filter_vals])
 
184
  """
185
  Generates a summary table using specific IDs as filters.
186
  """
187
+ combined_filter = _.sub_id.isin(ids)
188
  df_network = get_summary(ca, combined_filter, [column], column, feature_col, colors)
189
  df_feature = get_summary(ca, combined_filter, [column], column, feature_col, colors, feature = True)
190
  return df_network, df_feature
 
206
  Generates a MapLibre GL style for PMTiles with either filters or a list of IDs.
207
  """
208
  if ids:
209
+ filter_expr = ["in", ["get", "sub_id"], ["literal", ids]]
210
  else:
211
  # we don't want to overwhelm streamlit so if they didn't filter anything, don't provide filter arg
212
  filter_length = sum([len(x) for x in filter_vals])
app/variables.py CHANGED
@@ -463,8 +463,8 @@ help_message = '''
463
 
464
  example_queries = """
465
  Mapping queries:
466
- - Show me bird biodiversity hotspots not covered by the 30x30 network.
467
- - Show me GAP 3 lands with mean amphibian richness in the top 10%.
468
  - Show me easements with 60% or more overlap with high plant biodiversity regions.
469
  - Show me protected areas that are open to the public in disadvantaged communities.
470
  - Show me all 30x30 conservation areas managed by The Nature Conservancy.
 
463
 
464
  example_queries = """
465
  Mapping queries:
466
+ - Show me amphibian biodiversity hotspots not covered by the 30x30 network.
467
+ - Show me GAP 3 lands with mean bird richness in the top 10%.
468
  - Show me easements with 60% or more overlap with high plant biodiversity regions.
469
  - Show me protected areas that are open to the public in disadvantaged communities.
470
  - Show me all 30x30 conservation areas managed by The Nature Conservancy.