Spaces:
Sleeping
Sleeping
Added app structure
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +92 -0
- LICENSE +10 -0
- Makefile +144 -0
- __init__.py +0 -0
- config/data-config.yaml +24 -0
- config/model-config.yaml +13 -0
- config/model-parameters.yaml +32 -0
- docs/Makefile +153 -0
- docs/commands.rst +10 -0
- docs/conf.py +244 -0
- docs/getting-started.rst +6 -0
- docs/index.rst +24 -0
- docs/make.bat +190 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- notebooks/audio_conversational_agent.ipynb +770 -0
- notebooks/basic_system.ipynb +377 -0
- notebooks/essay_grading_01.ipynb +204 -0
- notebooks/flagged/log.csv +2 -0
- pipeline.py +123 -0
- references/.gitkeep +0 -0
- reports/.gitkeep +0 -0
- reports/figures/.gitkeep +0 -0
- requirements.txt +98 -0
- setup.py +10 -0
- src/__init__.py +0 -0
- src/chains/chain_creation.py +47 -0
- src/chains/general.py +82 -0
- src/chains/ielts.py +66 -0
- src/chains/toefl.py +130 -0
- src/data/.gitkeep +0 -0
- src/data/__init__.py +0 -0
- src/data/components/__init__.py +0 -0
- src/data/components/data_ingestion.py +40 -0
- src/data/components/data_preprocessing.py +45 -0
- src/data/components/data_transformation.py +49 -0
- src/data/configuration.py +59 -0
- src/data/entity.py +25 -0
- src/data/make_dataset.py +58 -0
- src/data/pipeline/__init__.py +0 -0
- src/data/pipeline/stage_01_data_ingestion.py +28 -0
- src/data/pipeline/stage_02_data_preprocessing.py +29 -0
- src/data/pipeline/stage_03_data_transformation.py +27 -0
- src/essay_evaluation.py +85 -0
- src/features/.gitkeep +0 -0
- src/features/__init__.py +0 -0
- src/features/build_features.py +0 -0
- src/logger/__init__.py +20 -0
- src/main.py +21 -0
- src/models/.gitkeep +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
|
| 5 |
+
# C extensions
|
| 6 |
+
*.so
|
| 7 |
+
|
| 8 |
+
# Distribution / packaging
|
| 9 |
+
.Python
|
| 10 |
+
env/
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# PyInstaller
|
| 27 |
+
# Usually these files are written by a python script from a template
|
| 28 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 29 |
+
*.manifest
|
| 30 |
+
*.spec
|
| 31 |
+
|
| 32 |
+
# Installer logs
|
| 33 |
+
pip-log.txt
|
| 34 |
+
pip-delete-this-directory.txt
|
| 35 |
+
|
| 36 |
+
# Unit test / coverage reports
|
| 37 |
+
htmlcov/
|
| 38 |
+
.tox/
|
| 39 |
+
.coverage
|
| 40 |
+
.coverage.*
|
| 41 |
+
.cache
|
| 42 |
+
nosetests.xml
|
| 43 |
+
coverage.xml
|
| 44 |
+
*.cover
|
| 45 |
+
|
| 46 |
+
# Translations
|
| 47 |
+
*.mo
|
| 48 |
+
*.pot
|
| 49 |
+
|
| 50 |
+
# Django stuff:
|
| 51 |
+
*.log
|
| 52 |
+
|
| 53 |
+
# Sphinx documentation
|
| 54 |
+
docs/_build/
|
| 55 |
+
|
| 56 |
+
# PyBuilder
|
| 57 |
+
target/
|
| 58 |
+
|
| 59 |
+
# DotEnv configuration
|
| 60 |
+
.env
|
| 61 |
+
|
| 62 |
+
# Database
|
| 63 |
+
*.db
|
| 64 |
+
*.rdb
|
| 65 |
+
|
| 66 |
+
# Pycharm
|
| 67 |
+
.idea
|
| 68 |
+
|
| 69 |
+
# VS Code
|
| 70 |
+
.vscode/
|
| 71 |
+
|
| 72 |
+
# Spyder
|
| 73 |
+
.spyproject/
|
| 74 |
+
|
| 75 |
+
# Jupyter NB Checkpoints
|
| 76 |
+
.ipynb_checkpoints/
|
| 77 |
+
|
| 78 |
+
# exclude data from source control by default
|
| 79 |
+
/data/
|
| 80 |
+
|
| 81 |
+
# Mac OS-specific storage files
|
| 82 |
+
.DS_Store
|
| 83 |
+
|
| 84 |
+
# vim
|
| 85 |
+
*.swp
|
| 86 |
+
*.swo
|
| 87 |
+
|
| 88 |
+
# Mypy cache
|
| 89 |
+
.mypy_cache/
|
| 90 |
+
|
| 91 |
+
# Environment
|
| 92 |
+
.venv/
|
LICENSE
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
The MIT License (MIT)
|
| 3 |
+
Copyright (c) 2024, Aleksandr Shishkov
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
| 6 |
+
|
| 7 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
| 8 |
+
|
| 9 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 10 |
+
|
Makefile
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
|
| 2 |
+
|
| 3 |
+
#################################################################################
|
| 4 |
+
# GLOBALS #
|
| 5 |
+
#################################################################################
|
| 6 |
+
|
| 7 |
+
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
|
| 8 |
+
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
|
| 9 |
+
PROFILE = default
|
| 10 |
+
PROJECT_NAME = deep-essay
|
| 11 |
+
PYTHON_INTERPRETER = python3
|
| 12 |
+
|
| 13 |
+
ifeq (,$(shell which conda))
|
| 14 |
+
HAS_CONDA=False
|
| 15 |
+
else
|
| 16 |
+
HAS_CONDA=True
|
| 17 |
+
endif
|
| 18 |
+
|
| 19 |
+
#################################################################################
|
| 20 |
+
# COMMANDS #
|
| 21 |
+
#################################################################################
|
| 22 |
+
|
| 23 |
+
## Install Python Dependencies
|
| 24 |
+
requirements: test_environment
|
| 25 |
+
$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
|
| 26 |
+
$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
|
| 27 |
+
|
| 28 |
+
## Make Dataset
|
| 29 |
+
data: requirements
|
| 30 |
+
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
|
| 31 |
+
|
| 32 |
+
## Delete all compiled Python files
|
| 33 |
+
clean:
|
| 34 |
+
find . -type f -name "*.py[co]" -delete
|
| 35 |
+
find . -type d -name "__pycache__" -delete
|
| 36 |
+
|
| 37 |
+
## Lint using flake8
|
| 38 |
+
lint:
|
| 39 |
+
flake8 src
|
| 40 |
+
|
| 41 |
+
## Upload Data to S3
|
| 42 |
+
sync_data_to_s3:
|
| 43 |
+
ifeq (default,$(PROFILE))
|
| 44 |
+
aws s3 sync data/ s3://$(BUCKET)/data/
|
| 45 |
+
else
|
| 46 |
+
aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
|
| 47 |
+
endif
|
| 48 |
+
|
| 49 |
+
## Download Data from S3
|
| 50 |
+
sync_data_from_s3:
|
| 51 |
+
ifeq (default,$(PROFILE))
|
| 52 |
+
aws s3 sync s3://$(BUCKET)/data/ data/
|
| 53 |
+
else
|
| 54 |
+
aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
|
| 55 |
+
endif
|
| 56 |
+
|
| 57 |
+
## Set up python interpreter environment
|
| 58 |
+
create_environment:
|
| 59 |
+
ifeq (True,$(HAS_CONDA))
|
| 60 |
+
@echo ">>> Detected conda, creating conda environment."
|
| 61 |
+
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
|
| 62 |
+
conda create --name $(PROJECT_NAME) python=3
|
| 63 |
+
else
|
| 64 |
+
conda create --name $(PROJECT_NAME) python=2.7
|
| 65 |
+
endif
|
| 66 |
+
@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
|
| 67 |
+
else
|
| 68 |
+
$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
|
| 69 |
+
@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
|
| 70 |
+
export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
|
| 71 |
+
@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
|
| 72 |
+
@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
|
| 73 |
+
endif
|
| 74 |
+
|
| 75 |
+
## Test python environment is setup correctly
|
| 76 |
+
test_environment:
|
| 77 |
+
$(PYTHON_INTERPRETER) test_environment.py
|
| 78 |
+
|
| 79 |
+
#################################################################################
|
| 80 |
+
# PROJECT RULES #
|
| 81 |
+
#################################################################################
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
#################################################################################
|
| 86 |
+
# Self Documenting Commands #
|
| 87 |
+
#################################################################################
|
| 88 |
+
|
| 89 |
+
.DEFAULT_GOAL := help
|
| 90 |
+
|
| 91 |
+
# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
|
| 92 |
+
# sed script explained:
|
| 93 |
+
# /^##/:
|
| 94 |
+
# * save line in hold space
|
| 95 |
+
# * purge line
|
| 96 |
+
# * Loop:
|
| 97 |
+
# * append newline + line to hold space
|
| 98 |
+
# * go to next line
|
| 99 |
+
# * if line starts with doc comment, strip comment character off and loop
|
| 100 |
+
# * remove target prerequisites
|
| 101 |
+
# * append hold space (+ newline) to line
|
| 102 |
+
# * replace newline plus comments by `---`
|
| 103 |
+
# * print line
|
| 104 |
+
# Separate expressions are necessary because labels cannot be delimited by
|
| 105 |
+
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
|
| 106 |
+
.PHONY: help
|
| 107 |
+
help:
|
| 108 |
+
@echo "$$(tput bold)Available rules:$$(tput sgr0)"
|
| 109 |
+
@echo
|
| 110 |
+
@sed -n -e "/^## / { \
|
| 111 |
+
h; \
|
| 112 |
+
s/.*//; \
|
| 113 |
+
:doc" \
|
| 114 |
+
-e "H; \
|
| 115 |
+
n; \
|
| 116 |
+
s/^## //; \
|
| 117 |
+
t doc" \
|
| 118 |
+
-e "s/:.*//; \
|
| 119 |
+
G; \
|
| 120 |
+
s/\\n## /---/; \
|
| 121 |
+
s/\\n/ /g; \
|
| 122 |
+
p; \
|
| 123 |
+
}" ${MAKEFILE_LIST} \
|
| 124 |
+
| LC_ALL='C' sort --ignore-case \
|
| 125 |
+
| awk -F '---' \
|
| 126 |
+
-v ncol=$$(tput cols) \
|
| 127 |
+
-v indent=19 \
|
| 128 |
+
-v col_on="$$(tput setaf 6)" \
|
| 129 |
+
-v col_off="$$(tput sgr0)" \
|
| 130 |
+
'{ \
|
| 131 |
+
printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
|
| 132 |
+
n = split($$2, words, " "); \
|
| 133 |
+
line_length = ncol - indent; \
|
| 134 |
+
for (i = 1; i <= n; i++) { \
|
| 135 |
+
line_length -= length(words[i]) + 1; \
|
| 136 |
+
if (line_length <= 0) { \
|
| 137 |
+
line_length = ncol - indent - length(words[i]) - 1; \
|
| 138 |
+
printf "\n%*s ", -indent, " "; \
|
| 139 |
+
} \
|
| 140 |
+
printf "%s ", words[i]; \
|
| 141 |
+
} \
|
| 142 |
+
printf "\n"; \
|
| 143 |
+
}' \
|
| 144 |
+
| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
|
__init__.py
ADDED
|
File without changes
|
config/data-config.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data_ingestion:
|
| 2 |
+
root_dir: data/raw
|
| 3 |
+
hf_dataset_name: qwedsacf/grade-school-math-instructions
|
| 4 |
+
hf_dataset_split: train
|
| 5 |
+
local_data_file: data/raw/data.zip
|
| 6 |
+
unzip_dir: data/raw
|
| 7 |
+
|
| 8 |
+
data_preprocessing:
|
| 9 |
+
root_dir: data/interim
|
| 10 |
+
raw_data_path: data/raw/raw_dataset.csv
|
| 11 |
+
question_key: INSTRUCTION
|
| 12 |
+
answer_key: RESPONSE
|
| 13 |
+
|
| 14 |
+
data_transformation:
|
| 15 |
+
root_dir: data/processed
|
| 16 |
+
finetuning_data_path: data/interim/finetuning_dataset.csv
|
| 17 |
+
train_data_split: 0.9
|
| 18 |
+
test_data_split: 0.05
|
| 19 |
+
eval_data_split: 0.05
|
| 20 |
+
|
| 21 |
+
data_validation:
|
| 22 |
+
root_dir: data/processed
|
| 23 |
+
STATUS_FILE: data/processed/status.txt
|
| 24 |
+
ALL_REQUIRED_FILES: ["train_dataset.csv", "test_dataset.csv", "eval_dataset.csv"]
|
config/model-config.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_trainer:
|
| 2 |
+
root_dir: models
|
| 3 |
+
data_path: data/processed
|
| 4 |
+
base_model: openlm-research/open_llama_3b_v2
|
| 5 |
+
training_name: 01_QLoRA
|
| 6 |
+
upload_from_hf: True
|
| 7 |
+
hf_model_name: Logisx/open_llama_3b_v2-Fine-Tuned-Grade_School_Math_Instructions
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
model_prediction:
|
| 11 |
+
data_path: data/processed
|
| 12 |
+
base_model: openlm-research/open_llama_3b_v2
|
| 13 |
+
adapters_path: models/01_QLoRA
|
config/model-parameters.yaml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
lora_parameters:
|
| 2 |
+
r: 16
|
| 3 |
+
target_modules: ["q_proj", "v_proj"]
|
| 4 |
+
lora_alpha: 8.0
|
| 5 |
+
lora_dropout: 0.05
|
| 6 |
+
bias: none
|
| 7 |
+
task_type: CAUSAL_LM
|
| 8 |
+
|
| 9 |
+
bits_and_bytes_parameters:
|
| 10 |
+
load_in_4bit: true
|
| 11 |
+
bnb_4bit_quant_type: nf4
|
| 12 |
+
bnb_4bit_use_double_quant: True
|
| 13 |
+
|
| 14 |
+
training_arguments:
|
| 15 |
+
output_dir: outputs
|
| 16 |
+
evaluation_strategy: epoch
|
| 17 |
+
save_strategy: epoch
|
| 18 |
+
num_train_epochs: 1.0
|
| 19 |
+
per_device_train_batch_size: 4
|
| 20 |
+
gradient_accumulation_steps: 4
|
| 21 |
+
optim: adamw_hf
|
| 22 |
+
learning_rate: 1e-5
|
| 23 |
+
fp16: True
|
| 24 |
+
max_grad_norm: 0.3
|
| 25 |
+
warmup_ratio: 0.03
|
| 26 |
+
group_by_length: True
|
| 27 |
+
lr_scheduler_type: linear
|
| 28 |
+
|
| 29 |
+
prediction_parameters:
|
| 30 |
+
length_penalty: 0.8
|
| 31 |
+
num_beams: 8
|
| 32 |
+
max_length: 128
|
docs/Makefile
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Makefile for Sphinx documentation
|
| 2 |
+
#
|
| 3 |
+
|
| 4 |
+
# You can set these variables from the command line.
|
| 5 |
+
SPHINXOPTS =
|
| 6 |
+
SPHINXBUILD = sphinx-build
|
| 7 |
+
PAPER =
|
| 8 |
+
BUILDDIR = _build
|
| 9 |
+
|
| 10 |
+
# Internal variables.
|
| 11 |
+
PAPEROPT_a4 = -D latex_paper_size=a4
|
| 12 |
+
PAPEROPT_letter = -D latex_paper_size=letter
|
| 13 |
+
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
| 14 |
+
# the i18n builder cannot share the environment and doctrees with the others
|
| 15 |
+
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
| 16 |
+
|
| 17 |
+
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
|
| 18 |
+
|
| 19 |
+
help:
|
| 20 |
+
@echo "Please use \`make <target>' where <target> is one of"
|
| 21 |
+
@echo " html to make standalone HTML files"
|
| 22 |
+
@echo " dirhtml to make HTML files named index.html in directories"
|
| 23 |
+
@echo " singlehtml to make a single large HTML file"
|
| 24 |
+
@echo " pickle to make pickle files"
|
| 25 |
+
@echo " json to make JSON files"
|
| 26 |
+
@echo " htmlhelp to make HTML files and a HTML help project"
|
| 27 |
+
@echo " qthelp to make HTML files and a qthelp project"
|
| 28 |
+
@echo " devhelp to make HTML files and a Devhelp project"
|
| 29 |
+
@echo " epub to make an epub"
|
| 30 |
+
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
| 31 |
+
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
| 32 |
+
@echo " text to make text files"
|
| 33 |
+
@echo " man to make manual pages"
|
| 34 |
+
@echo " texinfo to make Texinfo files"
|
| 35 |
+
@echo " info to make Texinfo files and run them through makeinfo"
|
| 36 |
+
@echo " gettext to make PO message catalogs"
|
| 37 |
+
@echo " changes to make an overview of all changed/added/deprecated items"
|
| 38 |
+
@echo " linkcheck to check all external links for integrity"
|
| 39 |
+
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
| 40 |
+
|
| 41 |
+
clean:
|
| 42 |
+
-rm -rf $(BUILDDIR)/*
|
| 43 |
+
|
| 44 |
+
html:
|
| 45 |
+
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
| 46 |
+
@echo
|
| 47 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
| 48 |
+
|
| 49 |
+
dirhtml:
|
| 50 |
+
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
| 51 |
+
@echo
|
| 52 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
| 53 |
+
|
| 54 |
+
singlehtml:
|
| 55 |
+
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
| 56 |
+
@echo
|
| 57 |
+
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
| 58 |
+
|
| 59 |
+
pickle:
|
| 60 |
+
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
| 61 |
+
@echo
|
| 62 |
+
@echo "Build finished; now you can process the pickle files."
|
| 63 |
+
|
| 64 |
+
json:
|
| 65 |
+
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
| 66 |
+
@echo
|
| 67 |
+
@echo "Build finished; now you can process the JSON files."
|
| 68 |
+
|
| 69 |
+
htmlhelp:
|
| 70 |
+
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
| 71 |
+
@echo
|
| 72 |
+
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
| 73 |
+
".hhp project file in $(BUILDDIR)/htmlhelp."
|
| 74 |
+
|
| 75 |
+
qthelp:
|
| 76 |
+
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
| 77 |
+
@echo
|
| 78 |
+
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
| 79 |
+
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
| 80 |
+
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/deep-essay.qhcp"
|
| 81 |
+
@echo "To view the help file:"
|
| 82 |
+
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/deep-essay.qhc"
|
| 83 |
+
|
| 84 |
+
devhelp:
|
| 85 |
+
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
| 86 |
+
@echo
|
| 87 |
+
@echo "Build finished."
|
| 88 |
+
@echo "To view the help file:"
|
| 89 |
+
@echo "# mkdir -p $$HOME/.local/share/devhelp/deep-essay"
|
| 90 |
+
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/deep-essay"
|
| 91 |
+
@echo "# devhelp"
|
| 92 |
+
|
| 93 |
+
epub:
|
| 94 |
+
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
| 95 |
+
@echo
|
| 96 |
+
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
| 97 |
+
|
| 98 |
+
latex:
|
| 99 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
| 100 |
+
@echo
|
| 101 |
+
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
| 102 |
+
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
| 103 |
+
"(use \`make latexpdf' here to do that automatically)."
|
| 104 |
+
|
| 105 |
+
latexpdf:
|
| 106 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
| 107 |
+
@echo "Running LaTeX files through pdflatex..."
|
| 108 |
+
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
| 109 |
+
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
| 110 |
+
|
| 111 |
+
text:
|
| 112 |
+
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
| 113 |
+
@echo
|
| 114 |
+
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
| 115 |
+
|
| 116 |
+
man:
|
| 117 |
+
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
| 118 |
+
@echo
|
| 119 |
+
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
| 120 |
+
|
| 121 |
+
texinfo:
|
| 122 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
| 123 |
+
@echo
|
| 124 |
+
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
| 125 |
+
@echo "Run \`make' in that directory to run these through makeinfo" \
|
| 126 |
+
"(use \`make info' here to do that automatically)."
|
| 127 |
+
|
| 128 |
+
info:
|
| 129 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
| 130 |
+
@echo "Running Texinfo files through makeinfo..."
|
| 131 |
+
make -C $(BUILDDIR)/texinfo info
|
| 132 |
+
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
| 133 |
+
|
| 134 |
+
gettext:
|
| 135 |
+
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
| 136 |
+
@echo
|
| 137 |
+
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
| 138 |
+
|
| 139 |
+
changes:
|
| 140 |
+
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
| 141 |
+
@echo
|
| 142 |
+
@echo "The overview file is in $(BUILDDIR)/changes."
|
| 143 |
+
|
| 144 |
+
linkcheck:
|
| 145 |
+
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
| 146 |
+
@echo
|
| 147 |
+
@echo "Link check complete; look for any errors in the above output " \
|
| 148 |
+
"or in $(BUILDDIR)/linkcheck/output.txt."
|
| 149 |
+
|
| 150 |
+
doctest:
|
| 151 |
+
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
| 152 |
+
@echo "Testing of doctests in the sources finished, look at the " \
|
| 153 |
+
"results in $(BUILDDIR)/doctest/output.txt."
|
docs/commands.rst
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Commands
|
| 2 |
+
========
|
| 3 |
+
|
| 4 |
+
The Makefile contains the central entry points for common tasks related to this project.
|
| 5 |
+
|
| 6 |
+
Syncing data to S3
|
| 7 |
+
^^^^^^^^^^^^^^^^^^
|
| 8 |
+
|
| 9 |
+
* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`.
|
| 10 |
+
* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`.
|
docs/conf.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
#
|
| 3 |
+
# DeepEssay documentation build configuration file, created by
|
| 4 |
+
# sphinx-quickstart.
|
| 5 |
+
#
|
| 6 |
+
# This file is execfile()d with the current directory set to its containing dir.
|
| 7 |
+
#
|
| 8 |
+
# Note that not all possible configuration values are present in this
|
| 9 |
+
# autogenerated file.
|
| 10 |
+
#
|
| 11 |
+
# All configuration values have a default; values that are commented out
|
| 12 |
+
# serve to show the default.
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
|
| 17 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
| 18 |
+
# add these directories to sys.path here. If the directory is relative to the
|
| 19 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
| 20 |
+
# sys.path.insert(0, os.path.abspath('.'))
|
| 21 |
+
|
| 22 |
+
# -- General configuration -----------------------------------------------------
|
| 23 |
+
|
| 24 |
+
# If your documentation needs a minimal Sphinx version, state it here.
|
| 25 |
+
# needs_sphinx = '1.0'
|
| 26 |
+
|
| 27 |
+
# Add any Sphinx extension module names here, as strings. They can be extensions
|
| 28 |
+
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
| 29 |
+
extensions = []
|
| 30 |
+
|
| 31 |
+
# Add any paths that contain templates here, relative to this directory.
|
| 32 |
+
templates_path = ['_templates']
|
| 33 |
+
|
| 34 |
+
# The suffix of source filenames.
|
| 35 |
+
source_suffix = '.rst'
|
| 36 |
+
|
| 37 |
+
# The encoding of source files.
|
| 38 |
+
# source_encoding = 'utf-8-sig'
|
| 39 |
+
|
| 40 |
+
# The master toctree document.
|
| 41 |
+
master_doc = 'index'
|
| 42 |
+
|
| 43 |
+
# General information about the project.
|
| 44 |
+
project = u'DeepEssay'
|
| 45 |
+
|
| 46 |
+
# The version info for the project you're documenting, acts as replacement for
|
| 47 |
+
# |version| and |release|, also used in various other places throughout the
|
| 48 |
+
# built documents.
|
| 49 |
+
#
|
| 50 |
+
# The short X.Y version.
|
| 51 |
+
version = '0.1'
|
| 52 |
+
# The full version, including alpha/beta/rc tags.
|
| 53 |
+
release = '0.1'
|
| 54 |
+
|
| 55 |
+
# The language for content autogenerated by Sphinx. Refer to documentation
|
| 56 |
+
# for a list of supported languages.
|
| 57 |
+
# language = None
|
| 58 |
+
|
| 59 |
+
# There are two options for replacing |today|: either, you set today to some
|
| 60 |
+
# non-false value, then it is used:
|
| 61 |
+
# today = ''
|
| 62 |
+
# Else, today_fmt is used as the format for a strftime call.
|
| 63 |
+
# today_fmt = '%B %d, %Y'
|
| 64 |
+
|
| 65 |
+
# List of patterns, relative to source directory, that match files and
|
| 66 |
+
# directories to ignore when looking for source files.
|
| 67 |
+
exclude_patterns = ['_build']
|
| 68 |
+
|
| 69 |
+
# The reST default role (used for this markup: `text`) to use for all documents.
|
| 70 |
+
# default_role = None
|
| 71 |
+
|
| 72 |
+
# If true, '()' will be appended to :func: etc. cross-reference text.
|
| 73 |
+
# add_function_parentheses = True
|
| 74 |
+
|
| 75 |
+
# If true, the current module name will be prepended to all description
|
| 76 |
+
# unit titles (such as .. function::).
|
| 77 |
+
# add_module_names = True
|
| 78 |
+
|
| 79 |
+
# If true, sectionauthor and moduleauthor directives will be shown in the
|
| 80 |
+
# output. They are ignored by default.
|
| 81 |
+
# show_authors = False
|
| 82 |
+
|
| 83 |
+
# The name of the Pygments (syntax highlighting) style to use.
|
| 84 |
+
pygments_style = 'sphinx'
|
| 85 |
+
|
| 86 |
+
# A list of ignored prefixes for module index sorting.
|
| 87 |
+
# modindex_common_prefix = []
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# -- Options for HTML output ---------------------------------------------------
|
| 91 |
+
|
| 92 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
| 93 |
+
# a list of builtin themes.
|
| 94 |
+
html_theme = 'default'
|
| 95 |
+
|
| 96 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
| 97 |
+
# further. For a list of options available for each theme, see the
|
| 98 |
+
# documentation.
|
| 99 |
+
# html_theme_options = {}
|
| 100 |
+
|
| 101 |
+
# Add any paths that contain custom themes here, relative to this directory.
|
| 102 |
+
# html_theme_path = []
|
| 103 |
+
|
| 104 |
+
# The name for this set of Sphinx documents. If None, it defaults to
|
| 105 |
+
# "<project> v<release> documentation".
|
| 106 |
+
# html_title = None
|
| 107 |
+
|
| 108 |
+
# A shorter title for the navigation bar. Default is the same as html_title.
|
| 109 |
+
# html_short_title = None
|
| 110 |
+
|
| 111 |
+
# The name of an image file (relative to this directory) to place at the top
|
| 112 |
+
# of the sidebar.
|
| 113 |
+
# html_logo = None
|
| 114 |
+
|
| 115 |
+
# The name of an image file (within the static path) to use as favicon of the
|
| 116 |
+
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
| 117 |
+
# pixels large.
|
| 118 |
+
# html_favicon = None
|
| 119 |
+
|
| 120 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
| 121 |
+
# relative to this directory. They are copied after the builtin static files,
|
| 122 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
| 123 |
+
html_static_path = ['_static']
|
| 124 |
+
|
| 125 |
+
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
| 126 |
+
# using the given strftime format.
|
| 127 |
+
# html_last_updated_fmt = '%b %d, %Y'
|
| 128 |
+
|
| 129 |
+
# If true, SmartyPants will be used to convert quotes and dashes to
|
| 130 |
+
# typographically correct entities.
|
| 131 |
+
# html_use_smartypants = True
|
| 132 |
+
|
| 133 |
+
# Custom sidebar templates, maps document names to template names.
|
| 134 |
+
# html_sidebars = {}
|
| 135 |
+
|
| 136 |
+
# Additional templates that should be rendered to pages, maps page names to
|
| 137 |
+
# template names.
|
| 138 |
+
# html_additional_pages = {}
|
| 139 |
+
|
| 140 |
+
# If false, no module index is generated.
|
| 141 |
+
# html_domain_indices = True
|
| 142 |
+
|
| 143 |
+
# If false, no index is generated.
|
| 144 |
+
# html_use_index = True
|
| 145 |
+
|
| 146 |
+
# If true, the index is split into individual pages for each letter.
|
| 147 |
+
# html_split_index = False
|
| 148 |
+
|
| 149 |
+
# If true, links to the reST sources are added to the pages.
|
| 150 |
+
# html_show_sourcelink = True
|
| 151 |
+
|
| 152 |
+
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
| 153 |
+
# html_show_sphinx = True
|
| 154 |
+
|
| 155 |
+
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
| 156 |
+
# html_show_copyright = True
|
| 157 |
+
|
| 158 |
+
# If true, an OpenSearch description file will be output, and all pages will
|
| 159 |
+
# contain a <link> tag referring to it. The value of this option must be the
|
| 160 |
+
# base URL from which the finished HTML is served.
|
| 161 |
+
# html_use_opensearch = ''
|
| 162 |
+
|
| 163 |
+
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
| 164 |
+
# html_file_suffix = None
|
| 165 |
+
|
| 166 |
+
# Output file base name for HTML help builder.
|
| 167 |
+
htmlhelp_basename = 'deep-essaydoc'
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# -- Options for LaTeX output --------------------------------------------------
|
| 171 |
+
|
| 172 |
+
latex_elements = {
|
| 173 |
+
# The paper size ('letterpaper' or 'a4paper').
|
| 174 |
+
# 'papersize': 'letterpaper',
|
| 175 |
+
|
| 176 |
+
# The font size ('10pt', '11pt' or '12pt').
|
| 177 |
+
# 'pointsize': '10pt',
|
| 178 |
+
|
| 179 |
+
# Additional stuff for the LaTeX preamble.
|
| 180 |
+
# 'preamble': '',
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
| 184 |
+
# (source start file, target name, title, author, documentclass [howto/manual]).
|
| 185 |
+
latex_documents = [
|
| 186 |
+
('index',
|
| 187 |
+
'deep-essay.tex',
|
| 188 |
+
u'DeepEssay Documentation',
|
| 189 |
+
u"Aleksandr Shishkov", 'manual'),
|
| 190 |
+
]
|
| 191 |
+
|
| 192 |
+
# The name of an image file (relative to this directory) to place at the top of
|
| 193 |
+
# the title page.
|
| 194 |
+
# latex_logo = None
|
| 195 |
+
|
| 196 |
+
# For "manual" documents, if this is true, then toplevel headings are parts,
|
| 197 |
+
# not chapters.
|
| 198 |
+
# latex_use_parts = False
|
| 199 |
+
|
| 200 |
+
# If true, show page references after internal links.
|
| 201 |
+
# latex_show_pagerefs = False
|
| 202 |
+
|
| 203 |
+
# If true, show URL addresses after external links.
|
| 204 |
+
# latex_show_urls = False
|
| 205 |
+
|
| 206 |
+
# Documents to append as an appendix to all manuals.
|
| 207 |
+
# latex_appendices = []
|
| 208 |
+
|
| 209 |
+
# If false, no module index is generated.
|
| 210 |
+
# latex_domain_indices = True
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# -- Options for manual page output --------------------------------------------
|
| 214 |
+
|
| 215 |
+
# One entry per manual page. List of tuples
|
| 216 |
+
# (source start file, name, description, authors, manual section).
|
| 217 |
+
man_pages = [
|
| 218 |
+
('index', 'deep-essay', u'DeepEssay Documentation',
|
| 219 |
+
[u"Aleksandr Shishkov"], 1)
|
| 220 |
+
]
|
| 221 |
+
|
| 222 |
+
# If true, show URL addresses after external links.
|
| 223 |
+
# man_show_urls = False
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# -- Options for Texinfo output ------------------------------------------------
|
| 227 |
+
|
| 228 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
| 229 |
+
# (source start file, target name, title, author,
|
| 230 |
+
# dir menu entry, description, category)
|
| 231 |
+
texinfo_documents = [
|
| 232 |
+
('index', 'deep-essay', u'DeepEssay Documentation',
|
| 233 |
+
u"Aleksandr Shishkov", 'DeepEssay',
|
| 234 |
+
'A short description of the project.', 'Miscellaneous'),
|
| 235 |
+
]
|
| 236 |
+
|
| 237 |
+
# Documents to append as an appendix to all manuals.
|
| 238 |
+
# texinfo_appendices = []
|
| 239 |
+
|
| 240 |
+
# If false, no module index is generated.
|
| 241 |
+
# texinfo_domain_indices = True
|
| 242 |
+
|
| 243 |
+
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
| 244 |
+
# texinfo_show_urls = 'footnote'
|
docs/getting-started.rst
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Getting started
|
| 2 |
+
===============
|
| 3 |
+
|
| 4 |
+
This is where you describe how to get set up on a clean install, including the
|
| 5 |
+
commands necessary to get the raw data (using the `sync_data_from_s3` command,
|
| 6 |
+
for example), and then how to make the cleaned, final data sets.
|
docs/index.rst
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. DeepEssay documentation master file, created by
|
| 2 |
+
sphinx-quickstart.
|
| 3 |
+
You can adapt this file completely to your liking, but it should at least
|
| 4 |
+
contain the root `toctree` directive.
|
| 5 |
+
|
| 6 |
+
DeepEssay documentation!
|
| 7 |
+
==============================================
|
| 8 |
+
|
| 9 |
+
Contents:
|
| 10 |
+
|
| 11 |
+
.. toctree::
|
| 12 |
+
:maxdepth: 2
|
| 13 |
+
|
| 14 |
+
getting-started
|
| 15 |
+
commands
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
Indices and tables
|
| 20 |
+
==================
|
| 21 |
+
|
| 22 |
+
* :ref:`genindex`
|
| 23 |
+
* :ref:`modindex`
|
| 24 |
+
* :ref:`search`
|
docs/make.bat
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@ECHO OFF
|
| 2 |
+
|
| 3 |
+
REM Command file for Sphinx documentation
|
| 4 |
+
|
| 5 |
+
if "%SPHINXBUILD%" == "" (
|
| 6 |
+
set SPHINXBUILD=sphinx-build
|
| 7 |
+
)
|
| 8 |
+
set BUILDDIR=_build
|
| 9 |
+
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
|
| 10 |
+
set I18NSPHINXOPTS=%SPHINXOPTS% .
|
| 11 |
+
if NOT "%PAPER%" == "" (
|
| 12 |
+
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
|
| 13 |
+
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
if "%1" == "" goto help
|
| 17 |
+
|
| 18 |
+
if "%1" == "help" (
|
| 19 |
+
:help
|
| 20 |
+
echo.Please use `make ^<target^>` where ^<target^> is one of
|
| 21 |
+
echo. html to make standalone HTML files
|
| 22 |
+
echo. dirhtml to make HTML files named index.html in directories
|
| 23 |
+
echo. singlehtml to make a single large HTML file
|
| 24 |
+
echo. pickle to make pickle files
|
| 25 |
+
echo. json to make JSON files
|
| 26 |
+
echo. htmlhelp to make HTML files and a HTML help project
|
| 27 |
+
echo. qthelp to make HTML files and a qthelp project
|
| 28 |
+
echo. devhelp to make HTML files and a Devhelp project
|
| 29 |
+
echo. epub to make an epub
|
| 30 |
+
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
|
| 31 |
+
echo. text to make text files
|
| 32 |
+
echo. man to make manual pages
|
| 33 |
+
echo. texinfo to make Texinfo files
|
| 34 |
+
echo. gettext to make PO message catalogs
|
| 35 |
+
echo. changes to make an overview over all changed/added/deprecated items
|
| 36 |
+
echo. linkcheck to check all external links for integrity
|
| 37 |
+
echo. doctest to run all doctests embedded in the documentation if enabled
|
| 38 |
+
goto end
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
if "%1" == "clean" (
|
| 42 |
+
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
|
| 43 |
+
del /q /s %BUILDDIR%\*
|
| 44 |
+
goto end
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
if "%1" == "html" (
|
| 48 |
+
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
|
| 49 |
+
if errorlevel 1 exit /b 1
|
| 50 |
+
echo.
|
| 51 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
|
| 52 |
+
goto end
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
if "%1" == "dirhtml" (
|
| 56 |
+
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
|
| 57 |
+
if errorlevel 1 exit /b 1
|
| 58 |
+
echo.
|
| 59 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
|
| 60 |
+
goto end
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
if "%1" == "singlehtml" (
|
| 64 |
+
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
|
| 65 |
+
if errorlevel 1 exit /b 1
|
| 66 |
+
echo.
|
| 67 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
|
| 68 |
+
goto end
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
if "%1" == "pickle" (
|
| 72 |
+
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
|
| 73 |
+
if errorlevel 1 exit /b 1
|
| 74 |
+
echo.
|
| 75 |
+
echo.Build finished; now you can process the pickle files.
|
| 76 |
+
goto end
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
if "%1" == "json" (
|
| 80 |
+
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
|
| 81 |
+
if errorlevel 1 exit /b 1
|
| 82 |
+
echo.
|
| 83 |
+
echo.Build finished; now you can process the JSON files.
|
| 84 |
+
goto end
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if "%1" == "htmlhelp" (
|
| 88 |
+
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
|
| 89 |
+
if errorlevel 1 exit /b 1
|
| 90 |
+
echo.
|
| 91 |
+
echo.Build finished; now you can run HTML Help Workshop with the ^
|
| 92 |
+
.hhp project file in %BUILDDIR%/htmlhelp.
|
| 93 |
+
goto end
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
if "%1" == "qthelp" (
|
| 97 |
+
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
|
| 98 |
+
if errorlevel 1 exit /b 1
|
| 99 |
+
echo.
|
| 100 |
+
echo.Build finished; now you can run "qcollectiongenerator" with the ^
|
| 101 |
+
.qhcp project file in %BUILDDIR%/qthelp, like this:
|
| 102 |
+
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\deep-essay.qhcp
|
| 103 |
+
echo.To view the help file:
|
| 104 |
+
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\deep-essay.ghc
|
| 105 |
+
goto end
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
if "%1" == "devhelp" (
|
| 109 |
+
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
|
| 110 |
+
if errorlevel 1 exit /b 1
|
| 111 |
+
echo.
|
| 112 |
+
echo.Build finished.
|
| 113 |
+
goto end
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
if "%1" == "epub" (
|
| 117 |
+
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
|
| 118 |
+
if errorlevel 1 exit /b 1
|
| 119 |
+
echo.
|
| 120 |
+
echo.Build finished. The epub file is in %BUILDDIR%/epub.
|
| 121 |
+
goto end
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
if "%1" == "latex" (
|
| 125 |
+
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
| 126 |
+
if errorlevel 1 exit /b 1
|
| 127 |
+
echo.
|
| 128 |
+
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
|
| 129 |
+
goto end
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
if "%1" == "text" (
|
| 133 |
+
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
|
| 134 |
+
if errorlevel 1 exit /b 1
|
| 135 |
+
echo.
|
| 136 |
+
echo.Build finished. The text files are in %BUILDDIR%/text.
|
| 137 |
+
goto end
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
if "%1" == "man" (
|
| 141 |
+
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
|
| 142 |
+
if errorlevel 1 exit /b 1
|
| 143 |
+
echo.
|
| 144 |
+
echo.Build finished. The manual pages are in %BUILDDIR%/man.
|
| 145 |
+
goto end
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
if "%1" == "texinfo" (
|
| 149 |
+
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
|
| 150 |
+
if errorlevel 1 exit /b 1
|
| 151 |
+
echo.
|
| 152 |
+
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
|
| 153 |
+
goto end
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if "%1" == "gettext" (
|
| 157 |
+
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
|
| 158 |
+
if errorlevel 1 exit /b 1
|
| 159 |
+
echo.
|
| 160 |
+
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
|
| 161 |
+
goto end
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
if "%1" == "changes" (
|
| 165 |
+
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
|
| 166 |
+
if errorlevel 1 exit /b 1
|
| 167 |
+
echo.
|
| 168 |
+
echo.The overview file is in %BUILDDIR%/changes.
|
| 169 |
+
goto end
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
if "%1" == "linkcheck" (
|
| 173 |
+
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
|
| 174 |
+
if errorlevel 1 exit /b 1
|
| 175 |
+
echo.
|
| 176 |
+
echo.Link check complete; look for any errors in the above output ^
|
| 177 |
+
or in %BUILDDIR%/linkcheck/output.txt.
|
| 178 |
+
goto end
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
if "%1" == "doctest" (
|
| 182 |
+
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
|
| 183 |
+
if errorlevel 1 exit /b 1
|
| 184 |
+
echo.
|
| 185 |
+
echo.Testing of doctests in the sources finished, look at the ^
|
| 186 |
+
results in %BUILDDIR%/doctest/output.txt.
|
| 187 |
+
goto end
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
:end
|
models/.gitkeep
ADDED
|
File without changes
|
notebooks/.gitkeep
ADDED
|
File without changes
|
notebooks/audio_conversational_agent.ipynb
ADDED
|
@@ -0,0 +1,770 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "d9a8a1c5-c3e6-4a9e-96c0-c3cca62884f3",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Audio RAG Chatbot with Langchain and AssemblyAI"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "markdown",
|
| 13 |
+
"id": "0188c9a1-4dc5-4792-baa9-e2358a30d8d4",
|
| 14 |
+
"metadata": {},
|
| 15 |
+
"source": [
|
| 16 |
+
"In this notebook we create a chatbot with functionality to transcribe audio files and use them to answer the questions from the user. \n",
|
| 17 |
+
"The chatbot is based on langchain agent with Multi-Query retriever tool for RAG."
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "markdown",
|
| 22 |
+
"id": "f510f96b-d3ff-45f2-bec4-9eff274ccfc3",
|
| 23 |
+
"metadata": {},
|
| 24 |
+
"source": [
|
| 25 |
+
"# Imports"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "raw",
|
| 30 |
+
"id": "1ef3f8ca-eb80-4b5a-8934-744e43433a0b",
|
| 31 |
+
"metadata": {
|
| 32 |
+
"scrolled": true
|
| 33 |
+
},
|
| 34 |
+
"source": [
|
| 35 |
+
"!pip install \\\n",
|
| 36 |
+
" python-dotenv==1.0.0 \\\n",
|
| 37 |
+
" openai==0.28.1 \\\n",
|
| 38 |
+
" langchain==0.0.316 \\\n",
|
| 39 |
+
" assemblyai==0.19.0 \\\n",
|
| 40 |
+
" tiktoken==0.5.1 \\\n",
|
| 41 |
+
" gradio==4.5.0 \\\n",
|
| 42 |
+
" chromadb==0.4.15 "
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": 1,
|
| 48 |
+
"id": "34564aeb-b6c0-42b0-93a4-46156f255d31",
|
| 49 |
+
"metadata": {},
|
| 50 |
+
"outputs": [
|
| 51 |
+
{
|
| 52 |
+
"name": "stderr",
|
| 53 |
+
"output_type": "stream",
|
| 54 |
+
"text": [
|
| 55 |
+
"C:\\Users\\logis\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 56 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 57 |
+
]
|
| 58 |
+
}
|
| 59 |
+
],
|
| 60 |
+
"source": [
|
| 61 |
+
"# System related imports\n",
|
| 62 |
+
"import os\n",
|
| 63 |
+
"from dotenv import load_dotenv, find_dotenv\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"# OpenAI related imports\n",
|
| 66 |
+
"import openai\n",
|
| 67 |
+
"from langchain.chat_models import ChatOpenAI\n",
|
| 68 |
+
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"# AssemblyAI and Document storage related imports\n",
|
| 71 |
+
"import tiktoken\n",
|
| 72 |
+
"import assemblyai as aai\n",
|
| 73 |
+
"from langchain.vectorstores import Chroma\n",
|
| 74 |
+
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
| 75 |
+
"from langchain.document_loaders import AssemblyAIAudioTranscriptLoader\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"# Agent imports\n",
|
| 78 |
+
"import logging\n",
|
| 79 |
+
"from typing import List\n",
|
| 80 |
+
"from pydantic import BaseModel, Field\n",
|
| 81 |
+
"from langchain.chains import LLMChain\n",
|
| 82 |
+
"from langchain.agents import AgentExecutor\n",
|
| 83 |
+
"from langchain.prompts import PromptTemplate\n",
|
| 84 |
+
"from langchain.prompts import ChatPromptTemplate\n",
|
| 85 |
+
"from langchain.prompts import MessagesPlaceholder\n",
|
| 86 |
+
"from langchain.memory import ConversationBufferMemory\n",
|
| 87 |
+
"from langchain.output_parsers import PydanticOutputParser\n",
|
| 88 |
+
"from langchain.schema.runnable import RunnablePassthrough\n",
|
| 89 |
+
"from langchain.retrievers.multi_query import MultiQueryRetriever\n",
|
| 90 |
+
"from langchain.tools.render import format_tool_to_openai_function\n",
|
| 91 |
+
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
|
| 92 |
+
"from langchain.agents.format_scratchpad import format_to_openai_functions\n",
|
| 93 |
+
"from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
|
| 94 |
+
"\n",
|
| 95 |
+
"# Chat UI\n",
|
| 96 |
+
"import gradio as gr\n",
|
| 97 |
+
"from langchain.schema import AIMessage, HumanMessage"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "markdown",
|
| 102 |
+
"id": "1add021b-201f-47ad-b4e3-44ba810d02c2",
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"source": [
|
| 105 |
+
"# Enviromental variables"
|
| 106 |
+
]
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"cell_type": "markdown",
|
| 110 |
+
"id": "07106937-17ff-4a22-8d27-5c79a8a83591",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"source": [
|
| 113 |
+
"Set up the APIs keys."
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 2,
|
| 119 |
+
"id": "3270c64b-5168-4085-b3d3-07a6ad1a2902",
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"_ = load_dotenv(find_dotenv()) # read local .env file\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"openai.api_key = os.environ['OPENAI_API_KEY']\n",
|
| 126 |
+
"aai.settings.api_key = os.environ['ASSEMBLYAI_API_KEY']"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "markdown",
|
| 131 |
+
"id": "ce25c860-8b01-4f68-954f-1962fe17887c",
|
| 132 |
+
"metadata": {},
|
| 133 |
+
"source": [
|
| 134 |
+
"# Transribing"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "markdown",
|
| 139 |
+
"id": "dcaefd5c-4573-4a5b-82cb-7ccb9dbd3c5b",
|
| 140 |
+
"metadata": {},
|
| 141 |
+
"source": [
|
| 142 |
+
"Start with defining the links to audiofiles that we want to transribe and use."
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"cell_type": "code",
|
| 147 |
+
"execution_count": 3,
|
| 148 |
+
"id": "ee1c5455-c41e-4852-9af4-b640593451c0",
|
| 149 |
+
"metadata": {},
|
| 150 |
+
"outputs": [],
|
| 151 |
+
"source": [
|
| 152 |
+
"URLs = [\n",
|
| 153 |
+
" \"https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus\",\n",
|
| 154 |
+
" \"https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus\",\n",
|
| 155 |
+
"]"
|
| 156 |
+
]
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"cell_type": "markdown",
|
| 160 |
+
"id": "aa0741b1-0262-4c0f-b3d8-d188f43c4e01",
|
| 161 |
+
"metadata": {},
|
| 162 |
+
"source": [
|
| 163 |
+
"Use AssemblyAI API to aggregate the audio and get the documents."
|
| 164 |
+
]
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"cell_type": "code",
|
| 168 |
+
"execution_count": 4,
|
| 169 |
+
"id": "c9c2ed5b-e3c3-412c-aab2-973fac60f432",
|
| 170 |
+
"metadata": {},
|
| 171 |
+
"outputs": [
|
| 172 |
+
{
|
| 173 |
+
"name": "stdout",
|
| 174 |
+
"output_type": "stream",
|
| 175 |
+
"text": [
|
| 176 |
+
"Transcribing https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus\n",
|
| 177 |
+
"Transcribing https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus\n"
|
| 178 |
+
]
|
| 179 |
+
}
|
| 180 |
+
],
|
| 181 |
+
"source": [
|
| 182 |
+
"def transcribe_audio(url):\n",
|
| 183 |
+
" transcripts = []\n",
|
| 184 |
+
" for url in URLs:\n",
|
| 185 |
+
" print(f'Transcribing {url}')\n",
|
| 186 |
+
" transcripts.append(AssemblyAIAudioTranscriptLoader(file_path=url).load()[0])\n",
|
| 187 |
+
" return transcripts\n",
|
| 188 |
+
"\n",
|
| 189 |
+
"docs = transcribe_audio(URLs) # list of documents"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"cell_type": "markdown",
|
| 194 |
+
"id": "1eb0bfab-2a27-4d60-8907-14951972b7ab",
|
| 195 |
+
"metadata": {},
|
| 196 |
+
"source": [
|
| 197 |
+
"# Vector storage"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"cell_type": "markdown",
|
| 202 |
+
"id": "354050ef-7da7-47e9-a9c6-27808b363d3b",
|
| 203 |
+
"metadata": {},
|
| 204 |
+
"source": [
|
| 205 |
+
"Split the documents to store the embedded splits in a vector store."
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"cell_type": "code",
|
| 210 |
+
"execution_count": 5,
|
| 211 |
+
"id": "0a6b312b-aa51-412e-85f4-715b227c51bc",
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"outputs": [],
|
| 214 |
+
"source": [
|
| 215 |
+
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)\n",
|
| 216 |
+
"splits = text_splitter.split_documents(docs)"
|
| 217 |
+
]
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"cell_type": "code",
|
| 221 |
+
"execution_count": 6,
|
| 222 |
+
"id": "fdcb5d39-5a44-4663-bd29-175f0fd52283",
|
| 223 |
+
"metadata": {},
|
| 224 |
+
"outputs": [],
|
| 225 |
+
"source": [
|
| 226 |
+
"# modify metadata because some AssemblyAI returned metadata is not in a compatible form for the Chroma db\n",
|
| 227 |
+
"for split in splits:\n",
|
| 228 |
+
" split.metadata = {\"audio_url\": split.metadata[\"audio_url\"]}"
|
| 229 |
+
]
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"cell_type": "code",
|
| 233 |
+
"execution_count": 7,
|
| 234 |
+
"id": "8cb50bca-9dd5-47a8-839b-1e21a3614c4c",
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"outputs": [],
|
| 237 |
+
"source": [
|
| 238 |
+
"# Remove old database files if any\n",
|
| 239 |
+
"# !rm -rf ./docs/chroma "
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"cell_type": "markdown",
|
| 244 |
+
"id": "62e12915-9648-44cc-a80f-974c129eae5d",
|
| 245 |
+
"metadata": {},
|
| 246 |
+
"source": [
|
| 247 |
+
"Creating vectordb with embedded splits."
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "code",
|
| 252 |
+
"execution_count": 8,
|
| 253 |
+
"id": "bdd1b7fa-7fb0-4309-9092-5826a67190ab",
|
| 254 |
+
"metadata": {
|
| 255 |
+
"scrolled": true
|
| 256 |
+
},
|
| 257 |
+
"outputs": [],
|
| 258 |
+
"source": [
|
| 259 |
+
"embedding = OpenAIEmbeddings()\n",
|
| 260 |
+
"persist_directory = 'docs/chroma/'\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"vectordb = Chroma.from_documents(\n",
|
| 263 |
+
" documents=splits,\n",
|
| 264 |
+
" embedding=embedding,\n",
|
| 265 |
+
" persist_directory=persist_directory\n",
|
| 266 |
+
")"
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"cell_type": "markdown",
|
| 271 |
+
"id": "0b1bc29c-69fa-4ba5-b389-224f6e002383",
|
| 272 |
+
"metadata": {},
|
| 273 |
+
"source": [
|
| 274 |
+
"# Creating a chat agent"
|
| 275 |
+
]
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"cell_type": "markdown",
|
| 279 |
+
"id": "1e1cd0a9-3067-4396-b57a-f0cb9c144a05",
|
| 280 |
+
"metadata": {},
|
| 281 |
+
"source": [
|
| 282 |
+
"## Multi-query retriever"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "markdown",
|
| 287 |
+
"id": "2005ba4b-2785-4bcf-a680-d87085d0c2f5",
|
| 288 |
+
"metadata": {},
|
| 289 |
+
"source": [
|
| 290 |
+
"Create a retriever that will generate multiple prompts similar to the user prompt and find splits relevant to the questions."
|
| 291 |
+
]
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"cell_type": "code",
|
| 295 |
+
"execution_count": 9,
|
| 296 |
+
"id": "513e5fdc-c854-40f0-a6ba-c874bf80f288",
|
| 297 |
+
"metadata": {},
|
| 298 |
+
"outputs": [],
|
| 299 |
+
"source": [
|
| 300 |
+
"# Output parser will split the LLM result into a list of queries\n",
|
| 301 |
+
"class LineList(BaseModel):\n",
|
| 302 |
+
" # \"lines\" is the key (attribute name) of the parsed output\n",
|
| 303 |
+
" lines: List[str] = Field(description=\"Lines of text\")\n",
|
| 304 |
+
"\n",
|
| 305 |
+
"\n",
|
| 306 |
+
"class LineListOutputParser(PydanticOutputParser):\n",
|
| 307 |
+
" def __init__(self) -> None:\n",
|
| 308 |
+
" super().__init__(pydantic_object=LineList)\n",
|
| 309 |
+
"\n",
|
| 310 |
+
" def parse(self, text: str) -> LineList:\n",
|
| 311 |
+
" lines = text.strip().split(\"\\n\")\n",
|
| 312 |
+
" return LineList(lines=lines)\n",
|
| 313 |
+
"\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"output_parser = LineListOutputParser()\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"QUERY_PROMPT = PromptTemplate(\n",
|
| 318 |
+
" input_variables=[\"question\"],\n",
|
| 319 |
+
" template=\"\"\"You are an AI language model assistant. Your task is to generate five \n",
|
| 320 |
+
" different versions of the given user question to retrieve relevant documents from a vector \n",
|
| 321 |
+
" database. By generating multiple perspectives on the user question, your goal is to help\n",
|
| 322 |
+
" the user overcome some of the limitations of the distance-based similarity search. \n",
|
| 323 |
+
" Provide these alternative questions separated by newlines.\n",
|
| 324 |
+
" Original question: {question}\"\"\",\n",
|
| 325 |
+
")\n",
|
| 326 |
+
"llm = ChatOpenAI(temperature=0)\n",
|
| 327 |
+
"\n",
|
| 328 |
+
"multi_query_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)\n",
|
| 329 |
+
"\n",
|
| 330 |
+
"retriever = MultiQueryRetriever(\n",
|
| 331 |
+
" retriever=vectordb.as_retriever(), llm_chain=multi_query_chain, parser_key=\"lines\"\n",
|
| 332 |
+
")"
|
| 333 |
+
]
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"cell_type": "markdown",
|
| 337 |
+
"id": "35a85f7b-e3d3-4966-a536-cd764902bbf4",
|
| 338 |
+
"metadata": {},
|
| 339 |
+
"source": [
|
| 340 |
+
"## Tools"
|
| 341 |
+
]
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"cell_type": "markdown",
|
| 345 |
+
"id": "b9306ddb-3b03-4b5c-a23f-63ef560f5185",
|
| 346 |
+
"metadata": {},
|
| 347 |
+
"source": [
|
| 348 |
+
"We have to convert the retriever to the tool so it can be used by an agent."
|
| 349 |
+
]
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"cell_type": "code",
|
| 353 |
+
"execution_count": 10,
|
| 354 |
+
"id": "a53d9a43-9bd8-44eb-838b-5f51ca1a8274",
|
| 355 |
+
"metadata": {},
|
| 356 |
+
"outputs": [],
|
| 357 |
+
"source": [
|
| 358 |
+
"search_related_documents = create_retriever_tool(\n",
|
| 359 |
+
" retriever,\n",
|
| 360 |
+
" \"RAG\",\n",
|
| 361 |
+
" \"Retrieves context related to the question from the audio provided by the user.\",\n",
|
| 362 |
+
")\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"tools = [search_related_documents]\n",
|
| 365 |
+
"functions = [format_tool_to_openai_function(f) for f in tools] # convert tools to OpenAI functions"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "markdown",
|
| 370 |
+
"id": "293525ac-5cb9-426c-b9ac-727d763ba037",
|
| 371 |
+
"metadata": {},
|
| 372 |
+
"source": [
|
| 373 |
+
"## Agent chain"
|
| 374 |
+
]
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"cell_type": "code",
|
| 378 |
+
"execution_count": 11,
|
| 379 |
+
"id": "5b4563fa-c412-4d95-ab82-ee62a08fe2c9",
|
| 380 |
+
"metadata": {},
|
| 381 |
+
"outputs": [],
|
| 382 |
+
"source": [
|
| 383 |
+
"model = ChatOpenAI(temperature=0).bind(functions=functions)\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"prompt = ChatPromptTemplate.from_messages([\n",
|
| 386 |
+
" (\"system\", \"You are helpful assistant. The user provided an audio as context to this chat. You can use it to retrieve context to answer the questions.\"),\n",
|
| 387 |
+
" MessagesPlaceholder(variable_name=\"chat_history\"),\n",
|
| 388 |
+
" (\"user\", \"{input}\"),\n",
|
| 389 |
+
" MessagesPlaceholder(variable_name=\"agent_scratchpad\")\n",
|
| 390 |
+
"])"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"cell_type": "code",
|
| 395 |
+
"execution_count": 12,
|
| 396 |
+
"id": "b630f996-1016-4533-9887-7660285f4113",
|
| 397 |
+
"metadata": {},
|
| 398 |
+
"outputs": [],
|
| 399 |
+
"source": [
|
| 400 |
+
"agent_chain = RunnablePassthrough.assign(\n",
|
| 401 |
+
" agent_scratchpad= lambda x: format_to_openai_functions(x[\"intermediate_steps\"])\n",
|
| 402 |
+
") | prompt | model | OpenAIFunctionsAgentOutputParser()"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"cell_type": "markdown",
|
| 407 |
+
"id": "cf9fbbab-732a-4b39-903c-8b270e1ac9b2",
|
| 408 |
+
"metadata": {},
|
| 409 |
+
"source": [
|
| 410 |
+
"## Agent executor"
|
| 411 |
+
]
|
| 412 |
+
},
|
| 413 |
+
{
|
| 414 |
+
"cell_type": "markdown",
|
| 415 |
+
"id": "bf8bde4c-4b57-4b8f-bfa6-fb5b6a7e467c",
|
| 416 |
+
"metadata": {},
|
| 417 |
+
"source": [
|
| 418 |
+
"Create the agent executor that will run the agent chain until the stop condition is met (in our case the system defines the stop on its own)."
|
| 419 |
+
]
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"cell_type": "code",
|
| 423 |
+
"execution_count": 13,
|
| 424 |
+
"id": "cd736bc2-185b-46c7-8049-f32ba2566893",
|
| 425 |
+
"metadata": {},
|
| 426 |
+
"outputs": [],
|
| 427 |
+
"source": [
|
| 428 |
+
"# Define memory type to store the conversation history\n",
|
| 429 |
+
"memory = ConversationBufferMemory(return_messages=True,memory_key=\"chat_history\")\n",
|
| 430 |
+
"agent_executor = AgentExecutor(agent=agent_chain, tools=tools, verbose=True, memory=memory)"
|
| 431 |
+
]
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"cell_type": "code",
|
| 435 |
+
"execution_count": 14,
|
| 436 |
+
"id": "256219a3-ed6e-4ad8-98a4-62b643d00c72",
|
| 437 |
+
"metadata": {},
|
| 438 |
+
"outputs": [],
|
| 439 |
+
"source": [
|
| 440 |
+
"# Set logging for the queries\n",
|
| 441 |
+
"logging.basicConfig()\n",
|
| 442 |
+
"logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.INFO)"
|
| 443 |
+
]
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"cell_type": "markdown",
|
| 447 |
+
"id": "003ed099-093a-4301-b0bd-e6907a41d034",
|
| 448 |
+
"metadata": {},
|
| 449 |
+
"source": [
|
| 450 |
+
"# Test "
|
| 451 |
+
]
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"cell_type": "markdown",
|
| 455 |
+
"id": "0848a164-649a-474a-a3fb-63f3f0a3ec74",
|
| 456 |
+
"metadata": {},
|
| 457 |
+
"source": [
|
| 458 |
+
"Let's try to make some questions not related to the document and check if it is able to store the chat history."
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"cell_type": "code",
|
| 463 |
+
"execution_count": 15,
|
| 464 |
+
"id": "1f6fe333-fb6a-4981-a3ef-2ebe7125a9db",
|
| 465 |
+
"metadata": {},
|
| 466 |
+
"outputs": [
|
| 467 |
+
{
|
| 468 |
+
"name": "stdout",
|
| 469 |
+
"output_type": "stream",
|
| 470 |
+
"text": [
|
| 471 |
+
"\n",
|
| 472 |
+
"\n",
|
| 473 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
| 474 |
+
"\u001b[32;1m\u001b[1;3mHello Logis! Nice to meet you too. How can I assist you today?\u001b[0m\n",
|
| 475 |
+
"\n",
|
| 476 |
+
"\u001b[1m> Finished chain.\u001b[0m\n"
|
| 477 |
+
]
|
| 478 |
+
},
|
| 479 |
+
{
|
| 480 |
+
"data": {
|
| 481 |
+
"text/plain": [
|
| 482 |
+
"{'input': 'Hi! Nice to meet you. My name is Logis.',\n",
|
| 483 |
+
" 'chat_history': [HumanMessage(content='Hi! Nice to meet you. My name is Logis.'),\n",
|
| 484 |
+
" AIMessage(content='Hello Logis! Nice to meet you too. How can I assist you today?')],\n",
|
| 485 |
+
" 'output': 'Hello Logis! Nice to meet you too. How can I assist you today?'}"
|
| 486 |
+
]
|
| 487 |
+
},
|
| 488 |
+
"execution_count": 15,
|
| 489 |
+
"metadata": {},
|
| 490 |
+
"output_type": "execute_result"
|
| 491 |
+
}
|
| 492 |
+
],
|
| 493 |
+
"source": [
|
| 494 |
+
"agent_executor.invoke({\"input\": \"Hi! Nice to meet you. My name is Logis.\"})"
|
| 495 |
+
]
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"cell_type": "code",
|
| 499 |
+
"execution_count": 16,
|
| 500 |
+
"id": "1563cf39-b209-44ff-9f85-46de4f844fb7",
|
| 501 |
+
"metadata": {},
|
| 502 |
+
"outputs": [
|
| 503 |
+
{
|
| 504 |
+
"name": "stdout",
|
| 505 |
+
"output_type": "stream",
|
| 506 |
+
"text": [
|
| 507 |
+
"\n",
|
| 508 |
+
"\n",
|
| 509 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n"
|
| 510 |
+
]
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"name": "stderr",
|
| 514 |
+
"output_type": "stream",
|
| 515 |
+
"text": [
|
| 516 |
+
"WARNING:langchain.llms.base:Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).\n"
|
| 517 |
+
]
|
| 518 |
+
},
|
| 519 |
+
{
|
| 520 |
+
"name": "stdout",
|
| 521 |
+
"output_type": "stream",
|
| 522 |
+
"text": [
|
| 523 |
+
"\u001b[32;1m\u001b[1;3mYour name is Logis.\u001b[0m\n",
|
| 524 |
+
"\n",
|
| 525 |
+
"\u001b[1m> Finished chain.\u001b[0m\n"
|
| 526 |
+
]
|
| 527 |
+
},
|
| 528 |
+
{
|
| 529 |
+
"data": {
|
| 530 |
+
"text/plain": [
|
| 531 |
+
"{'input': 'What is my name?',\n",
|
| 532 |
+
" 'chat_history': [HumanMessage(content='Hi! Nice to meet you. My name is Logis.'),\n",
|
| 533 |
+
" AIMessage(content='Hello Logis! Nice to meet you too. How can I assist you today?'),\n",
|
| 534 |
+
" HumanMessage(content='What is my name?'),\n",
|
| 535 |
+
" AIMessage(content='Your name is Logis.')],\n",
|
| 536 |
+
" 'output': 'Your name is Logis.'}"
|
| 537 |
+
]
|
| 538 |
+
},
|
| 539 |
+
"execution_count": 16,
|
| 540 |
+
"metadata": {},
|
| 541 |
+
"output_type": "execute_result"
|
| 542 |
+
}
|
| 543 |
+
],
|
| 544 |
+
"source": [
|
| 545 |
+
"agent_executor.invoke({\"input\": \"What is my name?\"})"
|
| 546 |
+
]
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"cell_type": "markdown",
|
| 550 |
+
"id": "db293ce6-6911-46f0-a486-d13fbc86a1ea",
|
| 551 |
+
"metadata": {},
|
| 552 |
+
"source": [
|
| 553 |
+
"Everything works well, what about retriving?"
|
| 554 |
+
]
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"cell_type": "code",
|
| 558 |
+
"execution_count": 17,
|
| 559 |
+
"id": "5af7e529-273d-4f16-9eae-d3057b880c26",
|
| 560 |
+
"metadata": {},
|
| 561 |
+
"outputs": [
|
| 562 |
+
{
|
| 563 |
+
"name": "stdout",
|
| 564 |
+
"output_type": "stream",
|
| 565 |
+
"text": [
|
| 566 |
+
"\n",
|
| 567 |
+
"\n",
|
| 568 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
| 569 |
+
"\u001b[32;1m\u001b[1;3m\n",
|
| 570 |
+
"Invoking: `RAG` with `audio`\n",
|
| 571 |
+
"\n",
|
| 572 |
+
"\n",
|
| 573 |
+
"\u001b[0m"
|
| 574 |
+
]
|
| 575 |
+
},
|
| 576 |
+
{
|
| 577 |
+
"name": "stderr",
|
| 578 |
+
"output_type": "stream",
|
| 579 |
+
"text": [
|
| 580 |
+
"INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are some applications of audio technology?', '2. How does audio compression work?', '3. Can you recommend any audio editing software?', '4. What are the different types of audio formats?', '5. How does audio streaming work?']\n"
|
| 581 |
+
]
|
| 582 |
+
},
|
| 583 |
+
{
|
| 584 |
+
"name": "stdout",
|
| 585 |
+
"output_type": "stream",
|
| 586 |
+
"text": [
|
| 587 |
+
"\u001b[36;1m\u001b[1;3m[Document(page_content=\"of your application. Maybe that's search, maybe that is some generative component. Whatever it is, at least that's the way I think of maybe Harrison can maybe comment on this a little bit, but I think with the self improvement, I think you're right that finding ways to make that self improvement persist from one conversation to another is huge. And DSP seems like a promising way to do that. I think in terms of the retrieval and long term memory side for agents, the way it normally looks is that the amount of information that you're searching over is going to generally be smaller. Right? It's like the memory of a single agent. That's more like the scale of the amount of information a person consumes on the internet. So we're talking hundreds of documents, thousands of documents rather than millions. And then you also are maybe doing some annotation of that at the same time with the language model. So it's not just a document, it's a document with additional metadata, which is a pattern\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content=\"AGI. First of all, I want John Carmack in April of 2023 to update that. Like, how many do you think we have? But the second thing that he said, which I thought was really interesting and true, is that a lot of these ideas likely have already been created and explored academically. So there was a ton of just incredible agent work done even in the 60s, but especially in the mid and late eighty s, there were three architectures which are really important to note here atlantis, BB one and soar and there are others as well. But my suspicion is that something else that we could do to kind of push this along a lot more quickly is to go back and see, okay, well, a lot of this thinking has already been done for us, but they didn't have any of the engineering to be able to implement this. 
And a lot of the science to implement this. Okay, what do we have? It looks like we can now do these things in 2023. What? In Atlantis. BB one Soar and similar types of architectures, cognitive architectures.\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'}), Document(page_content=\"several times in each query. How can you make this efficient and how can you make sure this adapts to the actual queries people ask? So our solution for this is you deploy your agent, and in particular you deploy this tool in front of users. You gather a bunch of questions from them. These questions don't have labels, they don't have answers, but you could still gather them and save them and then you could just say, hey DST, can you please compile my program that we built here and use these user questions? And I'd like you to use in this pipeline maybe the default language model that you have, which might be configured to be T five large on GitHub. The default is ada. OpenAI. Ada. So we've been playing with this for a while, and if you compile a DSP program that does multi hop search into T five large, you match the quality of retrieval that you're getting from DaVinci 3.5, and you only lose a very small kind of margin in terms of the final answer accuracy. But you don't need to go to\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content='on trying to get models that generalize better, cober v one generalized better to new domains. Just by happenstance, it worked better. But Cober v two was purpose built for this by a more resilient denoise training strategy. Look, I refer you to the paper for the details of that. But we also built this residual compression technique so that each of the vectors in the Colbert storage are really small. 
So if you work with Colbert V two right now, even though we have more than ten times as many vectors as your off the shelf, by encoder, the vectors are actually taking no more space than a standard DPR retriever or these single vector retrievers, because each of our vectors could be encoded in as little as 20 bytes or something like that. So we have applied this in domain and out of domain. So the two tables on the right are out of domain tests, a whole bunch of them. Beer we also introduced this Latte, a play on the words there and across. I think it was something like you could count', metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content='on trying to get models that generalize better, cober v one generalized better to new domains. Just by happenstance, it worked better. But Cober v Two was purpose built for this by a more resilient denoise training strategy. Look, I refer you to the paper for the details of that. But we also built this residual compression technique so that each of the vectors in the Colbert storage are really small. So if you work with Colbert V two right now, even though we have more than ten times as many vectors as your off the shelf, by encoder, the vectors are actually taking no more space than a standard DPR retriever or these single vector retrievers, because each of our vectors could be encoded in as little as 20 bytes or something like that. So we have applied this in domain and out of domain. So the two tables on the right are out of domain tests, a whole bunch of them. Beer we also introduced this Latte, a play on the words there and across. 
I think it was something like you could count', metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content=\"this can raise accuracy on benchmarks like hotpot QA from around 30% in a zero shot setting to a 35% in a few shot setting that you did not write the prompts for. So it can teach itself how to generate queries and how to use the passages and how to control the flow, et cetera. And you could easily add extensions to the pipeline that keep praising this. A very simple like few line addition makes this 41% exact match. So another sort of magical features, and I'll conclude with this, is the compile primitive in DSP. So let's say that and a lot of people ask things related to this. Let's say that you have this program now, which is a bootstrapped, few shot react agent built in DSP as a tool that you can use in your language conversational pipeline. But it's kind of expensive because it's going to talk to GPT 3.5 or GPT Four or whatever it is, several times in each query. How can you make this efficient and how can you make sure this adapts to the actual queries people ask? So our solution\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content=\"So you can basically evaluate that. And what is the generation that is coming out of that? Is that hallucinated or is it not? Right, so I think having that separation because that means you might have solved the retrieval problem, but then you have more of a prompt problem, the language model, how you tune that so you can separate, how you spend the rest of the time improving it. I think that makes sense. Yeah. Sort of treating these as separate types of bugs. Separate like exactly. Okay, well, then, yeah, we'll hear again from Joe when we have our open Q A session at the end. But next up we have Omar. So let me pull you up. 
So Omar was going to share primarily about a specific technique and some models for the specific technique of multi vector retrieval. So, yeah, Omar, I think you can take it away. All right. Yeah. Thanks for hosting us, Harrison, and thanks for joining Charles and leading the discussion. Joe, I really enjoyed your talk. We've been in touch for a long time since\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_retrieval_webinar.opus'}), Document(page_content=\"kind of show you guys what I mean if I can share my screen here. Let me know if you can see my screen. There's a Q and A button right next to the chat. Should be on the right hand side of your screen. You can put questions there and vote on questions that you want to hear the speakers discuss. And then Matt, you might want to zoom in a good bit on that one. A little hard to see. I think Matt Lang chain agent capped out. It broke. Yeah, we must hit the 32K context limit. That's my guess. Yeah, that's what it was. He tried to stuff it too much. Yeah. While we see what's happening there, let's see, there's a question, highly voted question from the Q A that I'd love to hear. Johan, should you talk like we're starting to see sort of like patterns of how agents work both at the level of low level prompting techniques about thought observation, action, and then also sort of like control flow techniques and augmentation techniques. So like have a memory stream that you can access that's\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'}), Document(page_content=\"the system prompt. And I can just kind of show you guys what I mean if I can share my screen here. Let me know if you can see my screen. There's a Q and A button right next to the chat. Should be on the right hand side of your screen. You can put questions there and vote on questions that you want to hear the speakers discuss. 
And then, Matt, you might want to zoom in a good bit on that one. A little hard to see. I think Matt Lang chain agent capped out. It broke. Yeah, we must hit the 32K context limit. That's my guess. Yeah, that's what it was. He tried to stuff it too much. Yeah. While we see what's happening there, let's see, there's a question, highly voted question from the Q A that I'd love to hear. Johan, should you talk like we're starting to see sort of like patterns of how agents work, both at the level of low level prompting techniques about thought observation action, and then also sort of like control flow techniques and augmentation techniques. So, like, have a memory\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'})]\u001b[0m\u001b[32;1m\u001b[1;3mThe topics discussed in the audio include:\n",
|
| 588 |
+
"1. Self-improvement and the use of DSP (Dialogue System Platform) for persistent self-improvement in conversations.\n",
|
| 589 |
+
"2. Different architectures and cognitive frameworks for building agents, such as Atlantis, BB one, Soar, and others.\n",
|
| 590 |
+
"3. Deploying agents and gathering user questions to compile a program using DSP.\n",
|
| 591 |
+
"4. Techniques for efficient and adaptive querying, including multi-hop search and the use of language models like T5 and Ada.\n",
|
| 592 |
+
"5. Compression techniques for reducing the storage space required for vectors in retrieval systems.\n",
|
| 593 |
+
"6. Improving model generalization and accuracy through denoise training and residual compression.\n",
|
| 594 |
+
"7. The use of DSP's compile primitive for efficient and adaptive conversation flow.\n",
|
| 595 |
+
"8. Separating retrieval and prompt problems in language models.\n",
|
| 596 |
+
"9. Techniques for multi-vector retrieval and memory access in agents.\n",
|
| 597 |
+
"10. Limitations and challenges in agent development, such as context limits and system prompts.\n",
|
| 598 |
+
"\n",
|
| 599 |
+
"Please note that the provided audio contains multiple discussions, so these topics are a summary of the main points covered.\u001b[0m\n",
|
| 600 |
+
"\n",
|
| 601 |
+
"\u001b[1m> Finished chain.\u001b[0m\n"
|
| 602 |
+
]
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"data": {
|
| 606 |
+
"text/plain": [
|
| 607 |
+
"{'input': 'What topics are discussed in the audio?',\n",
|
| 608 |
+
" 'chat_history': [HumanMessage(content='Hi! Nice to meet you. My name is Logis.'),\n",
|
| 609 |
+
" AIMessage(content='Hello Logis! Nice to meet you too. How can I assist you today?'),\n",
|
| 610 |
+
" HumanMessage(content='What is my name?'),\n",
|
| 611 |
+
" AIMessage(content='Your name is Logis.'),\n",
|
| 612 |
+
" HumanMessage(content='What topics are discussed in the audio?'),\n",
|
| 613 |
+
" AIMessage(content=\"The topics discussed in the audio include:\\n1. Self-improvement and the use of DSP (Dialogue System Platform) for persistent self-improvement in conversations.\\n2. Different architectures and cognitive frameworks for building agents, such as Atlantis, BB one, Soar, and others.\\n3. Deploying agents and gathering user questions to compile a program using DSP.\\n4. Techniques for efficient and adaptive querying, including multi-hop search and the use of language models like T5 and Ada.\\n5. Compression techniques for reducing the storage space required for vectors in retrieval systems.\\n6. Improving model generalization and accuracy through denoise training and residual compression.\\n7. The use of DSP's compile primitive for efficient and adaptive conversation flow.\\n8. Separating retrieval and prompt problems in language models.\\n9. Techniques for multi-vector retrieval and memory access in agents.\\n10. Limitations and challenges in agent development, such as context limits and system prompts.\\n\\nPlease note that the provided audio contains multiple discussions, so these topics are a summary of the main points covered.\")],\n",
|
| 614 |
+
" 'output': \"The topics discussed in the audio include:\\n1. Self-improvement and the use of DSP (Dialogue System Platform) for persistent self-improvement in conversations.\\n2. Different architectures and cognitive frameworks for building agents, such as Atlantis, BB one, Soar, and others.\\n3. Deploying agents and gathering user questions to compile a program using DSP.\\n4. Techniques for efficient and adaptive querying, including multi-hop search and the use of language models like T5 and Ada.\\n5. Compression techniques for reducing the storage space required for vectors in retrieval systems.\\n6. Improving model generalization and accuracy through denoise training and residual compression.\\n7. The use of DSP's compile primitive for efficient and adaptive conversation flow.\\n8. Separating retrieval and prompt problems in language models.\\n9. Techniques for multi-vector retrieval and memory access in agents.\\n10. Limitations and challenges in agent development, such as context limits and system prompts.\\n\\nPlease note that the provided audio contains multiple discussions, so these topics are a summary of the main points covered.\"}"
|
| 615 |
+
]
|
| 616 |
+
},
|
| 617 |
+
"execution_count": 17,
|
| 618 |
+
"metadata": {},
|
| 619 |
+
"output_type": "execute_result"
|
| 620 |
+
}
|
| 621 |
+
],
|
| 622 |
+
"source": [
|
| 623 |
+
"agent_executor.invoke({\"input\": \"What topics are discussed in the audio?\"})"
|
| 624 |
+
]
|
| 625 |
+
},
|
| 626 |
+
{
|
| 627 |
+
"cell_type": "markdown",
|
| 628 |
+
"id": "4f7b2307-da5b-4fd0-9fcb-fd2ba7b4d051",
|
| 629 |
+
"metadata": {},
|
| 630 |
+
"source": [
|
| 631 |
+
"# Creating UI for the chat"
|
| 632 |
+
]
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"cell_type": "code",
|
| 636 |
+
"execution_count": 18,
|
| 637 |
+
"id": "f30a099e-a4fd-4013-aaa2-c730715dc538",
|
| 638 |
+
"metadata": {},
|
| 639 |
+
"outputs": [
|
| 640 |
+
{
|
| 641 |
+
"name": "stdout",
|
| 642 |
+
"output_type": "stream",
|
| 643 |
+
"text": [
|
| 644 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
| 645 |
+
"\n",
|
| 646 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
| 647 |
+
]
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"data": {
|
| 651 |
+
"text/html": [
|
| 652 |
+
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 653 |
+
],
|
| 654 |
+
"text/plain": [
|
| 655 |
+
"<IPython.core.display.HTML object>"
|
| 656 |
+
]
|
| 657 |
+
},
|
| 658 |
+
"metadata": {},
|
| 659 |
+
"output_type": "display_data"
|
| 660 |
+
},
|
| 661 |
+
{
|
| 662 |
+
"data": {
|
| 663 |
+
"text/plain": []
|
| 664 |
+
},
|
| 665 |
+
"execution_count": 18,
|
| 666 |
+
"metadata": {},
|
| 667 |
+
"output_type": "execute_result"
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"name": "stdout",
|
| 671 |
+
"output_type": "stream",
|
| 672 |
+
"text": [
|
| 673 |
+
"\n",
|
| 674 |
+
"\n",
|
| 675 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
| 676 |
+
"\u001b[32;1m\u001b[1;3m\n",
|
| 677 |
+
"Invoking: `RAG` with `main topics from the webinars in bulletpoints`\n",
|
| 678 |
+
"\n",
|
| 679 |
+
"\n",
|
| 680 |
+
"\u001b[0m"
|
| 681 |
+
]
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"name": "stderr",
|
| 685 |
+
"output_type": "stream",
|
| 686 |
+
"text": [
|
| 687 |
+
"INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are the key subjects covered in the webinars, presented in bullet point format?', '2. Can you provide a list of the main topics discussed in the webinars, using bullet points?', '3. In bullet point form, what are the primary themes addressed in the webinars?', '4. Could you summarize the main subjects covered in the webinars, using bullet points?', '5. What are the main discussion points from the webinars, presented as bullet points?']\n"
|
| 688 |
+
]
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"name": "stdout",
|
| 692 |
+
"output_type": "stream",
|
| 693 |
+
"text": [
|
| 694 |
+
"\u001b[36;1m\u001b[1;3m[Document(page_content=\"better task prioritization agent for something like baby AGI. And the reason I bring that up is I think there's a lot of relationship between the way we operate, the way we think our brains, and as we think about building these autonomous agents that there's a lot of cross learning that we should absolutely be talking about. I did want to bring one last major topic in our ten minutes that we have remaining here, which was the question of safety. A lot of people are asking about security, like narrowly considered, which is like prompt, is this thing going to pseudo RMRF my machine? Is this thing going to drain my PayPal account? And then there's also the broader question of what happens when we unleash a whole bunch of much more capable web scraping and API hitting bots. So I'd love to hear just from each sort of panel member in what they think the path is for assuaging those concerns that people have. So I'd like to start with Shinyu. Yeah, actually I'm writing something about this.\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'}), Document(page_content=\"build a much better task prioritization agent for something like baby AGI. And the reason I bring that up is I think there's a lot of relationship between the way we operate, the way we think our brains, and as we think about building these autonomous agents that there's a lot of cross learning that we should absolutely be talking about. I did want to bring one last major topic in our ten minutes that we have remaining here, which was the question of safety. A lot of people are asking about security, like narrowly considered, which is like prompt, is this thing going to pseudo RMRF my machine? Is this thing going to drain my PayPal account? And then there's also the broader question of what happens when we unleash a whole bunch of much more capable web scraping and API hitting bots. 
So I'd love to hear just from each sort of panel member in what they think the path is for assuaging those concerns that people have. So I'd like to start with Shinyu. Yeah, actually I'm writing something\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'}), Document(page_content=\"nother webinar just on those last three responses, but fortunately, we've run out of time in this webinar. There was one question about Langchain that was the highest upvoted one that I wanted to hear from Harrison on. But before I kick that one over to Harrison, I did want to thank our panelists both for being here and answering questions and engaging with each other and with the audience, and also just generally for doing really incredible work and putting it out there for everybody else to learn from. Not everybody does that. Lots of people like to jealously hide their secrets. And so getting out there, putting things that other people can build on, it's something that we all, like everybody else in the community, really appreciates. So thanks to Matt, Johe and Shunyu for everything. And thank you, Charles, for moderating this. You're always the oh, thanks. It turns out that iteratively improving the criticism is actually the easiest part. The initial generation is the hard part.\", metadata={'audio_url': 'https://storage.googleapis.com/aai-web-samples/langchain_agents_webinar.opus'})]\u001b[0m\u001b[32;1m\u001b[1;3mSure! Here are the main topics discussed in the webinars:\n",
|
| 695 |
+
"\n",
|
| 696 |
+
"- Relationship between the way we operate and think, and building autonomous agents\n",
|
| 697 |
+
"- Task prioritization for building better agents\n",
|
| 698 |
+
"- Safety concerns in AI, including security and potential misuse of capabilities\n",
|
| 699 |
+
"- Assuaging concerns about security and the impact of web scraping and API hitting bots\n",
|
| 700 |
+
"- Panelists' perspectives on addressing safety concerns\n",
|
| 701 |
+
"- Langchain and its relevance to the discussion\n",
|
| 702 |
+
"- Appreciation for panelists' work and sharing knowledge with the community\n",
|
| 703 |
+
"- Challenges in the initial generation of AI models and the importance of iterative improvement\n",
|
| 704 |
+
"\n",
|
| 705 |
+
"Please note that these topics are a summary of the main points covered in the webinars.\u001b[0m\n",
|
| 706 |
+
"\n",
|
| 707 |
+
"\u001b[1m> Finished chain.\u001b[0m\n",
|
| 708 |
+
"\n",
|
| 709 |
+
"\n",
|
| 710 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
| 711 |
+
"\u001b[32;1m\u001b[1;3mThe term \"agents\" refers to software programs or systems that are designed to perform specific tasks or functions autonomously. These agents can range from simple rule-based systems to more complex artificial intelligence (AI) systems. Agents are typically built to interact with users or other systems, gather information, make decisions, and perform actions based on predefined rules or learned behaviors.\n",
|
| 712 |
+
"\n",
|
| 713 |
+
"In the context of the audio, the term \"agents\" specifically refers to conversational agents or dialogue systems. These agents are designed to engage in conversations with users, understand their queries or requests, and provide relevant responses or actions. The discussions in the audio explore various architectures, frameworks, and techniques for building and improving these conversational agents, including topics like self-improvement, efficient querying, model generalization, and memory access.\n",
|
| 714 |
+
"\n",
|
| 715 |
+
"Overall, agents are software entities that can perform tasks autonomously, and in the context of the audio, the focus is on conversational agents or dialogue systems.\u001b[0m\n",
|
| 716 |
+
"\n",
|
| 717 |
+
"\u001b[1m> Finished chain.\u001b[0m\n",
|
| 718 |
+
"\n",
|
| 719 |
+
"\n",
|
| 720 |
+
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
| 721 |
+
"\u001b[32;1m\u001b[1;3mYou're welcome! I'm glad I could provide you with the information you needed. If you have any more questions, feel free to ask. I'm here to help!\u001b[0m\n",
|
| 722 |
+
"\n",
|
| 723 |
+
"\u001b[1m> Finished chain.\u001b[0m\n"
|
| 724 |
+
]
|
| 725 |
+
}
|
| 726 |
+
],
|
| 727 |
+
"source": [
|
| 728 |
+
"def predict(message, history):\n",
|
| 729 |
+
" history = []\n",
|
| 730 |
+
" for human, ai in history:\n",
|
| 731 |
+
" history.append(HumanMessage(content=human))\n",
|
| 732 |
+
" history.append(AIMessage(content=ai))\n",
|
| 733 |
+
" history.append(HumanMessage(content=message))\n",
|
| 734 |
+
" response = agent_executor.invoke({\"input\": message})\n",
|
| 735 |
+
" return response[\"output\"]\n",
|
| 736 |
+
" \n",
|
| 737 |
+
"gr.ChatInterface(predict).queue().launch()"
|
| 738 |
+
]
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"cell_type": "code",
|
| 742 |
+
"execution_count": null,
|
| 743 |
+
"id": "e6e6c9a1-03ad-4649-bfaa-972a24a448c6",
|
| 744 |
+
"metadata": {},
|
| 745 |
+
"outputs": [],
|
| 746 |
+
"source": []
|
| 747 |
+
}
|
| 748 |
+
],
|
| 749 |
+
"metadata": {
|
| 750 |
+
"kernelspec": {
|
| 751 |
+
"display_name": "Python 3 (ipykernel)",
|
| 752 |
+
"language": "python",
|
| 753 |
+
"name": "python3"
|
| 754 |
+
},
|
| 755 |
+
"language_info": {
|
| 756 |
+
"codemirror_mode": {
|
| 757 |
+
"name": "ipython",
|
| 758 |
+
"version": 3
|
| 759 |
+
},
|
| 760 |
+
"file_extension": ".py",
|
| 761 |
+
"mimetype": "text/x-python",
|
| 762 |
+
"name": "python",
|
| 763 |
+
"nbconvert_exporter": "python",
|
| 764 |
+
"pygments_lexer": "ipython3",
|
| 765 |
+
"version": "3.11.1"
|
| 766 |
+
}
|
| 767 |
+
},
|
| 768 |
+
"nbformat": 4,
|
| 769 |
+
"nbformat_minor": 5
|
| 770 |
+
}
|
notebooks/basic_system.ipynb
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"name": "stderr",
|
| 17 |
+
"output_type": "stream",
|
| 18 |
+
"text": [
|
| 19 |
+
"DEPRECATION: Loading egg at c:\\programdata\\anaconda3\\lib\\site-packages\\vboxapi-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..\n"
|
| 20 |
+
]
|
| 21 |
+
}
|
| 22 |
+
],
|
| 23 |
+
"source": [
|
| 24 |
+
"%pip install --upgrade --quiet langchain-core langchain-community langchain-openai python-dotenv gradio"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": 4,
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [
|
| 32 |
+
{
|
| 33 |
+
"name": "stdout",
|
| 34 |
+
"output_type": "stream",
|
| 35 |
+
"text": [
|
| 36 |
+
"Defaulting to user installation because normal site-packages is not writeable\n",
|
| 37 |
+
"Requirement already satisfied: python-dotenv==1.0.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (1.0.0)\n",
|
| 38 |
+
"Requirement already satisfied: gradio in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (4.27.0)\n",
|
| 39 |
+
"Requirement already satisfied: aiofiles<24.0,>=22.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (22.1.0)\n",
|
| 40 |
+
"Requirement already satisfied: altair<6.0,>=4.2.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (5.3.0)\n",
|
| 41 |
+
"Requirement already satisfied: fastapi in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.110.2)\n",
|
| 42 |
+
"Requirement already satisfied: ffmpy in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.3.2)\n",
|
| 43 |
+
"Requirement already satisfied: gradio-client==0.15.1 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.15.1)\n",
|
| 44 |
+
"Requirement already satisfied: httpx>=0.24.1 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.27.0)\n",
|
| 45 |
+
"Requirement already satisfied: huggingface-hub>=0.19.3 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.22.2)\n",
|
| 46 |
+
"Requirement already satisfied: importlib-resources<7.0,>=1.3 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (6.4.0)\n",
|
| 47 |
+
"Requirement already satisfied: jinja2<4.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (3.1.2)\n",
|
| 48 |
+
"Requirement already satisfied: markupsafe~=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (2.1.1)\n",
|
| 49 |
+
"Requirement already satisfied: matplotlib~=3.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (3.7.2)\n",
|
| 50 |
+
"Requirement already satisfied: numpy~=1.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (1.24.3)\n",
|
| 51 |
+
"Requirement already satisfied: orjson~=3.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (3.10.1)\n",
|
| 52 |
+
"Requirement already satisfied: packaging in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (23.2)\n",
|
| 53 |
+
"Requirement already satisfied: pandas<3.0,>=1.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (2.0.3)\n",
|
| 54 |
+
"Requirement already satisfied: pillow<11.0,>=8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (10.0.1)\n",
|
| 55 |
+
"Requirement already satisfied: pydantic>=2.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (2.7.1)\n",
|
| 56 |
+
"Requirement already satisfied: pydub in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.25.1)\n",
|
| 57 |
+
"Requirement already satisfied: python-multipart>=0.0.9 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.0.9)\n",
|
| 58 |
+
"Requirement already satisfied: pyyaml<7.0,>=5.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from gradio) (6.0)\n",
|
| 59 |
+
"Requirement already satisfied: ruff>=0.2.2 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.4.1)\n",
|
| 60 |
+
"Requirement already satisfied: semantic-version~=2.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (2.10.0)\n",
|
| 61 |
+
"Requirement already satisfied: tomlkit==0.12.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.12.0)\n",
|
| 62 |
+
"Requirement already satisfied: typer<1.0,>=0.12 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.12.3)\n",
|
| 63 |
+
"Requirement already satisfied: typing-extensions~=4.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (4.11.0)\n",
|
| 64 |
+
"Requirement already satisfied: urllib3~=2.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (2.2.1)\n",
|
| 65 |
+
"Requirement already satisfied: uvicorn>=0.14.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio) (0.29.0)\n",
|
| 66 |
+
"Requirement already satisfied: fsspec in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio-client==0.15.1->gradio) (2024.3.1)\n",
|
| 67 |
+
"Requirement already satisfied: websockets<12.0,>=10.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from gradio-client==0.15.1->gradio) (11.0.3)\n",
|
| 68 |
+
"Requirement already satisfied: jsonschema>=3.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (4.17.3)\n",
|
| 69 |
+
"Requirement already satisfied: toolz in c:\\programdata\\anaconda3\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (0.12.0)\n",
|
| 70 |
+
"Requirement already satisfied: anyio in c:\\programdata\\anaconda3\\lib\\site-packages (from httpx>=0.24.1->gradio) (3.5.0)\n",
|
| 71 |
+
"Requirement already satisfied: certifi in c:\\programdata\\anaconda3\\lib\\site-packages (from httpx>=0.24.1->gradio) (2023.11.17)\n",
|
| 72 |
+
"Requirement already satisfied: httpcore==1.* in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from httpx>=0.24.1->gradio) (1.0.5)\n",
|
| 73 |
+
"Requirement already satisfied: idna in c:\\programdata\\anaconda3\\lib\\site-packages (from httpx>=0.24.1->gradio) (3.4)\n",
|
| 74 |
+
"Requirement already satisfied: sniffio in c:\\programdata\\anaconda3\\lib\\site-packages (from httpx>=0.24.1->gradio) (1.2.0)\n",
|
| 75 |
+
"Requirement already satisfied: h11<0.15,>=0.13 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n",
|
| 76 |
+
"Requirement already satisfied: filelock in c:\\programdata\\anaconda3\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (3.9.0)\n",
|
| 77 |
+
"Requirement already satisfied: requests in c:\\programdata\\anaconda3\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (2.31.0)\n",
|
| 78 |
+
"Requirement already satisfied: tqdm>=4.42.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (4.65.0)\n",
|
| 79 |
+
"Requirement already satisfied: contourpy>=1.0.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.0.5)\n",
|
| 80 |
+
"Requirement already satisfied: cycler>=0.10 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (0.11.0)\n",
|
| 81 |
+
"Requirement already satisfied: fonttools>=4.22.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (4.25.0)\n",
|
| 82 |
+
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.4.4)\n",
|
| 83 |
+
"Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (3.0.9)\n",
|
| 84 |
+
"Requirement already satisfied: python-dateutil>=2.7 in c:\\programdata\\anaconda3\\lib\\site-packages (from matplotlib~=3.0->gradio) (2.8.2)\n",
|
| 85 |
+
"Requirement already satisfied: pytz>=2020.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas<3.0,>=1.0->gradio) (2023.3.post1)\n",
|
| 86 |
+
"Requirement already satisfied: tzdata>=2022.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas<3.0,>=1.0->gradio) (2023.3)\n",
|
| 87 |
+
"Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from pydantic>=2.0->gradio) (0.6.0)\n",
|
| 88 |
+
"Requirement already satisfied: pydantic-core==2.18.2 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from pydantic>=2.0->gradio) (2.18.2)\n",
|
| 89 |
+
"Requirement already satisfied: click>=8.0.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from typer<1.0,>=0.12->gradio) (8.0.4)\n",
|
| 90 |
+
"Requirement already satisfied: shellingham>=1.3.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n",
|
| 91 |
+
"Requirement already satisfied: rich>=10.11.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from typer<1.0,>=0.12->gradio) (13.7.1)\n",
|
| 92 |
+
"Requirement already satisfied: starlette<0.38.0,>=0.37.2 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from fastapi->gradio) (0.37.2)\n",
|
| 93 |
+
"Requirement already satisfied: colorama in c:\\programdata\\anaconda3\\lib\\site-packages (from click>=8.0.0->typer<1.0,>=0.12->gradio) (0.4.6)\n",
|
| 94 |
+
"Requirement already satisfied: attrs>=17.4.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (22.1.0)\n",
|
| 95 |
+
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.18.0)\n",
|
| 96 |
+
"Requirement already satisfied: six>=1.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio) (1.16.0)\n",
|
| 97 |
+
"Requirement already satisfied: markdown-it-py>=2.2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.2.0)\n",
|
| 98 |
+
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\logis\\appdata\\roaming\\python\\python311\\site-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.14.0)\n",
|
| 99 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\programdata\\anaconda3\\lib\\site-packages (from requests->huggingface-hub>=0.19.3->gradio) (2.0.4)\n",
|
| 100 |
+
"Requirement already satisfied: mdurl~=0.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.0)\n",
|
| 101 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"name": "stderr",
|
| 106 |
+
"output_type": "stream",
|
| 107 |
+
"text": [
|
| 108 |
+
"DEPRECATION: Loading egg at c:\\programdata\\anaconda3\\lib\\site-packages\\vboxapi-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..\n"
|
| 109 |
+
]
|
| 110 |
+
}
|
| 111 |
+
],
|
| 112 |
+
"source": [
|
| 113 |
+
"%pip install \\\n",
|
| 114 |
+
" python-dotenv==1.0.0 \\\n",
|
| 115 |
+
" gradio"
|
| 116 |
+
]
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"cell_type": "code",
|
| 120 |
+
"execution_count": null,
|
| 121 |
+
"metadata": {},
|
| 122 |
+
"outputs": [],
|
| 123 |
+
"source": []
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "code",
|
| 127 |
+
"execution_count": 5,
|
| 128 |
+
"metadata": {},
|
| 129 |
+
"outputs": [],
|
| 130 |
+
"source": [
|
| 131 |
+
"\n",
|
| 132 |
+
"import os\n",
|
| 133 |
+
"from dotenv import load_dotenv, find_dotenv\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"_ = load_dotenv(find_dotenv()) # read local .env file"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": 6,
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [],
|
| 143 |
+
"source": [
|
| 144 |
+
"from langchain_openai import ChatOpenAI\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"model = ChatOpenAI(model=\"gpt-4\")"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "code",
|
| 151 |
+
"execution_count": 7,
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [],
|
| 154 |
+
"source": [
|
| 155 |
+
"from langchain_core.output_parsers import StrOutputParser\n",
|
| 156 |
+
"from langchain_core.prompts import ChatPromptTemplate\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"template = \"\"\"\n",
|
| 159 |
+
"You are a proffesional IELTS essay evaluator.\n",
|
| 160 |
+
"Your task is to evaluate an essay text that I provided below according to the International English Language Testing System (IELTS) format. Ensure the essay meets the requirements of an IELTS essay in terms of structure, length, coherence, and language proficiency. Then, grade the essay based on the IELTS scoring system and provide detailed feedback on each criterion.\n",
|
| 161 |
+
"More detailed evaluation instructions and the essay text are provided below:\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"\n",
|
| 164 |
+
"### EVALUATION INSTRUCTIONS:\n",
|
| 165 |
+
"1. Check if the essay conforms to the standard IELTS essay format, including introduction, body paragraphs, and conclusion.\n",
|
| 166 |
+
"Ensure the essay is of appropriate length (minimum 250 words).\n",
|
| 167 |
+
"Evaluate whether the essay addresses the given topic comprehensively and logically.\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"- Task Response (TR): Evaluate how well the essay addresses the task, presents a clear position, and supports it with relevant examples and arguments.\n",
|
| 170 |
+
"- Coherence and Cohesion (CC): Assess the organization of ideas, logical progression, and coherence between paragraphs and sentences.\n",
|
| 171 |
+
"- Lexical Resource (LR): Examine the range and accuracy of vocabulary used, including the ability to paraphrase and express ideas precisely.\n",
|
| 172 |
+
"- Grammatical Range and Accuracy (GRA): Evaluate the variety and accuracy of grammatical structures used, including sentence structure and punctuation.\n",
|
| 173 |
+
"\n",
|
| 174 |
+
"2. Provide specific feedback for each criterion, highlighting strengths and areas for improvement.\n",
|
| 175 |
+
"Offer advice on how the essay can be enhanced to achieve a higher score in each category.\n",
|
| 176 |
+
"Suggest alternative vocabulary, sentence structures, or argumentative strategies where applicable.\n",
|
| 177 |
+
"\n",
|
| 178 |
+
"3. Strictly follow the response structure below, do not say something after you give the scores and feedback. If you have nothing to say, leave it blank. If the text is completely not related to an essay (e.g it is just one sentence) you may not follow the structure and point out the issue.\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"### RESPONSE STRUCTURE\n",
|
| 182 |
+
"Use only the following answer structure and do not include any personal opinions or biases in the evaluation:\n",
|
| 183 |
+
"\"Band X.X\n",
|
| 184 |
+
"Task Response: X\n",
|
| 185 |
+
"Coherence and Cohesion: X\n",
|
| 186 |
+
"Lexical Resource: X\n",
|
| 187 |
+
"Grammatical Range and Accuracy: X\n",
|
| 188 |
+
"Detailed Feedback: [Provide detailed feedback here]\"\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"### EXAMPLE RESPONSE:\n",
|
| 192 |
+
"\"\n",
|
| 193 |
+
"Band 7.5\n",
|
| 194 |
+
"Task Response: 7\n",
|
| 195 |
+
"Coherence and Cohesion: 8\n",
|
| 196 |
+
"Lexical Resource: 8\n",
|
| 197 |
+
"Grammatical Range and Accuracy: 7\n",
|
| 198 |
+
"The test taker presents a clear position at the outset and explores some ideas to support this. An\n",
|
| 199 |
+
"alternative position is also considered, but rejected. This is a strong response, but there is rather\n",
|
| 200 |
+
"too much emphasis on technology: other aspects of the proposition could also be considered,\n",
|
| 201 |
+
"e.g. less physical work and more sedentary work, greater reliance on cars meaning less\n",
|
| 202 |
+
"exercise, aging populations in some countries leading to more complex health issues. Ideas are\n",
|
| 203 |
+
"organised logically and there is a clear progression throughout the response, with good use of\n",
|
| 204 |
+
"cohesive devices and logical paragraphing. The response could perhaps be improved by\n",
|
| 205 |
+
"breaking down paragraphs 2 and 3. There is a wide range of vocabulary with good use of less\n",
|
| 206 |
+
"common items as well as evidence of higher level features, such as ‘softening’, e.g. ‘They tend\n",
|
| 207 |
+
"to’, ‘This appears to be’, and ‘might disagree’. Errors in spelling and word formation are rare.\n",
|
| 208 |
+
"There is also a variety of complex structures with frequent error-free sentences, though some\n",
|
| 209 |
+
"errors do occur and there is some overuse of rather short sentence forms.\n",
|
| 210 |
+
"\"\n",
|
| 211 |
+
"\n",
|
| 212 |
+
"### ESSAY TEXT:\n",
|
| 213 |
+
"{essay_text}\n",
|
| 214 |
+
"\"\"\"\n"
|
| 215 |
+
]
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"cell_type": "code",
|
| 219 |
+
"execution_count": 8,
|
| 220 |
+
"metadata": {},
|
| 221 |
+
"outputs": [],
|
| 222 |
+
"source": [
|
| 223 |
+
"essay_text = \"\"\"\n",
|
| 224 |
+
"Recently, there have been a lot of discussions about health and whether it is going to\n",
|
| 225 |
+
"improve or not. In my opinion, I think that people will become unhealthier in the future than\n",
|
| 226 |
+
"they are now.\n",
|
| 227 |
+
"There are many reasons that support the idea of people becoming unhealthy in the future.\n",
|
| 228 |
+
"Firstly, one reason is that of food. People tend to eat more fast food nowadays. They tend to\n",
|
| 229 |
+
"treat themselves with sweets and chocolate whenever they want. This appears to be\n",
|
| 230 |
+
"because people are busier now than they used to be. So, people don’t have a chance to\n",
|
| 231 |
+
"cook or even learn the art of cookery. Also, having a lot of unhealthy food can lead to obesity\n",
|
| 232 |
+
"and it could be a serious issue in the future. Another reason is that technology is developing\n",
|
| 233 |
+
"everyday. Young people enjoy buying new gadgets and the latest devices. This has a\n",
|
| 234 |
+
"negative impact on their health, especially when they enjoy video games. Spending long\n",
|
| 235 |
+
"hours looking at a screen can lead to bad eyesight and obesity as well. Yet another reason\n",
|
| 236 |
+
"is that laziness is a big issue. Different forms of exercise might disappear in the future\n",
|
| 237 |
+
"because people don’t like sports. Also, people prefer spending most of their time on the\n",
|
| 238 |
+
"internet and the internet is growing every single day.\n",
|
| 239 |
+
"Other people might disagree and say that health will improve in the future. They believe\n",
|
| 240 |
+
"that new sports and new ways to exercise will appear in the future. However, I don’t think it\n",
|
| 241 |
+
"can happen since the majority of people spend less time outdoors. Moreover, other people\n",
|
| 242 |
+
"believe that technology will try and help people improve their health. For example, there\n",
|
| 243 |
+
"have been some games released on the Wii console that makes people exercise but\n",
|
| 244 |
+
"technology is developing more in a negative way. For instance, many phone industries are\n",
|
| 245 |
+
"developing new applications everyday and today’s generation likes to follow every trend.\n",
|
| 246 |
+
"This prevents people to go outside to exercise. They like to spend more time on the internet\n",
|
| 247 |
+
"downloading new programmes or reading gossips about celebraties. This affects people’s\n",
|
| 248 |
+
"health badly.\n",
|
| 249 |
+
"In conclusion, I believe that people’s health is affected negatively by fast food, technology\n",
|
| 250 |
+
"and sports and it will be a problem in the future.\"\"\""
|
| 251 |
+
]
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"cell_type": "code",
|
| 255 |
+
"execution_count": 9,
|
| 256 |
+
"metadata": {},
|
| 257 |
+
"outputs": [],
|
| 258 |
+
"source": [
|
| 259 |
+
"prompt = ChatPromptTemplate.from_template(template)\n",
|
| 260 |
+
"output_parser = StrOutputParser()\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"chain = prompt | model | output_parser\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"result = chain.invoke({\"essay_text\": essay_text})"
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"cell_type": "code",
|
| 269 |
+
"execution_count": 10,
|
| 270 |
+
"metadata": {},
|
| 271 |
+
"outputs": [
|
| 272 |
+
{
|
| 273 |
+
"name": "stdout",
|
| 274 |
+
"output_type": "stream",
|
| 275 |
+
"text": [
|
| 276 |
+
"Band 6.5\n",
|
| 277 |
+
"Task Response: 6\n",
|
| 278 |
+
"Coherence and Cohesion: 7\n",
|
| 279 |
+
"Lexical Resource: 6\n",
|
| 280 |
+
"Grammatical Range and Accuracy: 7\n",
|
| 281 |
+
"Detailed Feedback: The essay does follow the standard IELTS format, with an introduction, body paragraphs, and conclusion, and it addresses the task relatively well. The author presents a clear position and supports it with some relevant examples. However, the argument could be more comprehensive and logically structured. The essay also slightly lacks depth in its exploration of the topic. \n",
|
| 282 |
+
"\n",
|
| 283 |
+
"The organization of ideas and logical progression is quite good, with each paragraph discussing a different aspect of the argument. However, some points are not developed fully and transition between ideas could be smoother. \n",
|
| 284 |
+
"\n",
|
| 285 |
+
"The range of vocabulary is adequate but could be more varied and precise. There are a few instances of awkward phrasing and incorrect word usage, such as \"celebraties\" instead of \"celebrities\". The author could benefit from using more academic language and formal expressions. \n",
|
| 286 |
+
"\n",
|
| 287 |
+
"The grammatical range is good, with a variety of sentence structures employed. However, there are a few minor errors and inaccuracies. The author could improve their score by proofreading their work for grammar and spelling mistakes and by using more complex, varied sentence structures. It would also be beneficial to avoid repetitions and aim for a more academic writing style. \n",
|
| 288 |
+
"\n",
|
| 289 |
+
"Overall, the essay is a satisfactory response to the task, but there is room for improvement in all areas. The author should work on developing their argument more fully, improving the flow of ideas, expanding their vocabulary, and polishing their grammar.\n"
|
| 290 |
+
]
|
| 291 |
+
}
|
| 292 |
+
],
|
| 293 |
+
"source": [
|
| 294 |
+
"print(result)"
|
| 295 |
+
]
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"cell_type": "code",
|
| 299 |
+
"execution_count": 20,
|
| 300 |
+
"metadata": {},
|
| 301 |
+
"outputs": [
|
| 302 |
+
{
|
| 303 |
+
"name": "stdout",
|
| 304 |
+
"output_type": "stream",
|
| 305 |
+
"text": [
|
| 306 |
+
"Running on local URL: http://127.0.0.1:7867\n",
|
| 307 |
+
"\n",
|
| 308 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"data": {
|
| 313 |
+
"text/html": [
|
| 314 |
+
"<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 315 |
+
],
|
| 316 |
+
"text/plain": [
|
| 317 |
+
"<IPython.core.display.HTML object>"
|
| 318 |
+
]
|
| 319 |
+
},
|
| 320 |
+
"metadata": {},
|
| 321 |
+
"output_type": "display_data"
|
| 322 |
+
}
|
| 323 |
+
],
|
| 324 |
+
"source": [
|
| 325 |
+
"import gradio as gr\n",
|
| 326 |
+
"\n",
|
| 327 |
+
"\n",
|
| 328 |
+
"def evaluate(type, text):\n",
|
| 329 |
+
" result = chain.invoke({\"essay_text\": text})\n",
|
| 330 |
+
" return result\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"demo = gr.Interface(\n",
|
| 333 |
+
" fn = evaluate,\n",
|
| 334 |
+
" inputs = [\n",
|
| 335 |
+
" gr.Radio([\"IELTS\", \"TOEFL\", \"General\"], label=\"Essay type\", info=\"What type of essay you have?\"),\n",
|
| 336 |
+
" gr.\n",
|
| 337 |
+
" gr.Textbox(label=\"Essay text\", placeholder=\"Enter your essay here...\", container=True)\n",
|
| 338 |
+
" ],\n",
|
| 339 |
+
" outputs = [\n",
|
| 340 |
+
" gr.Textbox(label=\"Results & Comments\")\n",
|
| 341 |
+
" ]\n",
|
| 342 |
+
")\n",
|
| 343 |
+
"\n",
|
| 344 |
+
"if __name__ == \"__main__\":\n",
|
| 345 |
+
" demo.launch()"
|
| 346 |
+
]
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"cell_type": "code",
|
| 350 |
+
"execution_count": null,
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"outputs": [],
|
| 353 |
+
"source": []
|
| 354 |
+
}
|
| 355 |
+
],
|
| 356 |
+
"metadata": {
|
| 357 |
+
"kernelspec": {
|
| 358 |
+
"display_name": "base",
|
| 359 |
+
"language": "python",
|
| 360 |
+
"name": "python3"
|
| 361 |
+
},
|
| 362 |
+
"language_info": {
|
| 363 |
+
"codemirror_mode": {
|
| 364 |
+
"name": "ipython",
|
| 365 |
+
"version": 3
|
| 366 |
+
},
|
| 367 |
+
"file_extension": ".py",
|
| 368 |
+
"mimetype": "text/x-python",
|
| 369 |
+
"name": "python",
|
| 370 |
+
"nbconvert_exporter": "python",
|
| 371 |
+
"pygments_lexer": "ipython3",
|
| 372 |
+
"version": "3.11.5"
|
| 373 |
+
}
|
| 374 |
+
},
|
| 375 |
+
"nbformat": 4,
|
| 376 |
+
"nbformat_minor": 2
|
| 377 |
+
}
|
notebooks/essay_grading_01.ipynb
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "f510f96b-d3ff-45f2-bec4-9eff274ccfc3",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Imports"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "raw",
|
| 13 |
+
"id": "1ef3f8ca-eb80-4b5a-8934-744e43433a0b",
|
| 14 |
+
"metadata": {
|
| 15 |
+
"scrolled": true,
|
| 16 |
+
"vscode": {
|
| 17 |
+
"languageId": "raw"
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"source": [
|
| 21 |
+
"!pip install \\\n",
|
| 22 |
+
" python-dotenv==1.0.0 \\\n",
|
| 23 |
+
" openai==0.28.1 \\\n",
|
| 24 |
+
" langchain==0.0.316 \\\n",
|
| 25 |
+
" gradio==4.5.0"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 1,
|
| 31 |
+
"id": "34564aeb-b6c0-42b0-93a4-46156f255d31",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stderr",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"C:\\Users\\logis\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 39 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 40 |
+
]
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"source": [
|
| 44 |
+
"# System related imports\n",
|
| 45 |
+
"import os\n",
|
| 46 |
+
"from dotenv import load_dotenv, find_dotenv\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"# OpenAI related imports\n",
|
| 49 |
+
"import openai\n",
|
| 50 |
+
"from langchain.chat_models import ChatOpenAI\n",
|
| 51 |
+
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"# Agent imports\n",
|
| 54 |
+
"import logging\n",
|
| 55 |
+
"from typing import List\n",
|
| 56 |
+
"from pydantic import BaseModel, Field\n",
|
| 57 |
+
"from langchain.chains import LLMChain\n",
|
| 58 |
+
"from langchain.agents import AgentExecutor\n",
|
| 59 |
+
"from langchain.prompts import PromptTemplate\n",
|
| 60 |
+
"from langchain.prompts import ChatPromptTemplate\n",
|
| 61 |
+
"from langchain.prompts import MessagesPlaceholder\n",
|
| 62 |
+
"from langchain.memory import ConversationBufferMemory\n",
|
| 63 |
+
"from langchain.output_parsers import PydanticOutputParser\n",
|
| 64 |
+
"from langchain.schema.runnable import RunnablePassthrough\n",
|
| 65 |
+
"from langchain.retrievers.multi_query import MultiQueryRetriever\n",
|
| 66 |
+
"from langchain.tools.render import format_tool_to_openai_function\n",
|
| 67 |
+
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
|
| 68 |
+
"from langchain.agents.format_scratchpad import format_to_openai_functions\n",
|
| 69 |
+
"from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"# Chat UI\n",
|
| 72 |
+
"import gradio as gr\n",
|
| 73 |
+
"from langchain.schema import AIMessage, HumanMessage"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "markdown",
|
| 78 |
+
"id": "1add021b-201f-47ad-b4e3-44ba810d02c2",
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"source": [
|
| 81 |
+
"# Enviromental variables"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "markdown",
|
| 86 |
+
"id": "07106937-17ff-4a22-8d27-5c79a8a83591",
|
| 87 |
+
"metadata": {},
|
| 88 |
+
"source": [
|
| 89 |
+
"Set up the APIs keys."
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": 2,
|
| 95 |
+
"id": "3270c64b-5168-4085-b3d3-07a6ad1a2902",
|
| 96 |
+
"metadata": {},
|
| 97 |
+
"outputs": [],
|
| 98 |
+
"source": [
|
| 99 |
+
"_ = load_dotenv(find_dotenv()) # read local .env file\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"openai.api_key = os.environ['OPENAI_API_KEY']"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": 9,
|
| 107 |
+
"id": "513e5fdc-c854-40f0-a6ba-c874bf80f288",
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"outputs": [],
|
| 110 |
+
"source": [
|
| 111 |
+
"# Output parser will split the LLM result into a list of queries\n",
|
| 112 |
+
"class LineList(BaseModel):\n",
|
| 113 |
+
" # \"lines\" is the key (attribute name) of the parsed output\n",
|
| 114 |
+
" lines: List[str] = Field(description=\"Lines of text\")\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"class LineListOutputParser(PydanticOutputParser):\n",
|
| 118 |
+
" def __init__(self) -> None:\n",
|
| 119 |
+
" super().__init__(pydantic_object=LineList)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
" def parse(self, text: str) -> LineList:\n",
|
| 122 |
+
" lines = text.strip().split(\"\\n\")\n",
|
| 123 |
+
" return LineList(lines=lines)\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"\n",
|
| 126 |
+
"output_parser = LineListOutputParser()\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"QUERY_PROMPT = PromptTemplate(\n",
|
| 129 |
+
" input_variables=[\"essay_text\"],\n",
|
| 130 |
+
" template=\"\"\"\n",
|
| 131 |
+
" Task Description:\n",
|
| 132 |
+
"Your task is to evaluate an essay text according to the International English Language Testing System (IELTS) format. Ensure the essay meets the requirements of an IELTS essay in terms of structure, length, coherence, and language proficiency. Then, grade the essay based on the IELTS scoring system and provide detailed feedback on each criterion.\n",
|
| 133 |
+
"\n",
|
| 134 |
+
"Essay Text:\n",
|
| 135 |
+
"{essay_text}\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"Instructions to the Language Model:\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"### Relevance to IELTS Format:\n",
|
| 140 |
+
"Check if the essay conforms to the standard IELTS essay format, including introduction, body paragraphs, and conclusion.\n",
|
| 141 |
+
"Ensure the essay is of appropriate length (minimum 250 words).\n",
|
| 142 |
+
"Evaluate whether the essay addresses the given topic comprehensively and logically.\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"### Grading Criteria:\n",
|
| 145 |
+
"- Task Response (TR): Evaluate how well the essay addresses the task, presents a clear position, and supports it with relevant examples and arguments.\n",
|
| 146 |
+
"- Coherence and Cohesion (CC): Assess the organization of ideas, logical progression, and coherence between paragraphs and sentences.\n",
|
| 147 |
+
"- Lexical Resource (LR): Examine the range and accuracy of vocabulary used, including the ability to paraphrase and express ideas precisely.\n",
|
| 148 |
+
"- Grammatical Range and Accuracy (GRA): Evaluate the variety and accuracy of grammatical structures used, including sentence structure and punctuation.\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"### Detailed Comments and Advice:\n",
|
| 151 |
+
"Provide specific feedback for each criterion, highlighting strengths and areas for improvement.\n",
|
| 152 |
+
"Offer advice on how the essay can be enhanced to achieve a higher score in each category.\n",
|
| 153 |
+
"Suggest alternative vocabulary, sentence structures, or argumentative strategies where applicable.\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"### Example Response:\n",
|
| 156 |
+
"\"\n",
|
| 157 |
+
"Band 7.5\n",
|
| 158 |
+
"Task Response: 7\n",
|
| 159 |
+
"Coherence and Cohesion: 8\n",
|
| 160 |
+
"Lexical Resource: 8\n",
|
| 161 |
+
"Grammatical Range and Accuracy: 7\n",
|
| 162 |
+
"The test taker presents a clear position at the outset and explores some ideas to support this. An\n",
|
| 163 |
+
"alternative position is also considered, but rejected. This is a strong response, but there is rather\n",
|
| 164 |
+
"too much emphasis on technology: other aspects of the proposition could also be considered,\n",
|
| 165 |
+
"e.g. less physical work and more sedentary work, greater reliance on cars meaning less\n",
|
| 166 |
+
"exercise, aging populations in some countries leading to more complex health issues. Ideas are\n",
|
| 167 |
+
"organised logically and there is a clear progression throughout the response, with good use of\n",
|
| 168 |
+
"cohesive devices and logical paragraphing. The response could perhaps be improved by\n",
|
| 169 |
+
"breaking down paragraphs 2 and 3. There is a wide range of vocabulary with good use of less\n",
|
| 170 |
+
"common items as well as evidence of higher level features, such as ‘softening’, e.g. ‘They tend\n",
|
| 171 |
+
"to’, ‘This appears to be’, and ‘might disagree’. Errors in spelling and word formation are rare.\n",
|
| 172 |
+
"There is also a variety of complex structures with frequent error-free sentences, though some\n",
|
| 173 |
+
"errors do occur and there is some overuse of rather short sentence forms.\n",
|
| 174 |
+
"\"\n",
|
| 175 |
+
"\"\"\",\n",
|
| 176 |
+
")\n",
|
| 177 |
+
"llm = ChatOpenAI(temperature=0)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"multi_query_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)"
|
| 180 |
+
]
|
| 181 |
+
}
|
| 182 |
+
],
|
| 183 |
+
"metadata": {
|
| 184 |
+
"kernelspec": {
|
| 185 |
+
"display_name": "Python 3 (ipykernel)",
|
| 186 |
+
"language": "python",
|
| 187 |
+
"name": "python3"
|
| 188 |
+
},
|
| 189 |
+
"language_info": {
|
| 190 |
+
"codemirror_mode": {
|
| 191 |
+
"name": "ipython",
|
| 192 |
+
"version": 3
|
| 193 |
+
},
|
| 194 |
+
"file_extension": ".py",
|
| 195 |
+
"mimetype": "text/x-python",
|
| 196 |
+
"name": "python",
|
| 197 |
+
"nbconvert_exporter": "python",
|
| 198 |
+
"pygments_lexer": "ipython3",
|
| 199 |
+
"version": "3.11.1"
|
| 200 |
+
}
|
| 201 |
+
},
|
| 202 |
+
"nbformat": 4,
|
| 203 |
+
"nbformat_minor": 5
|
| 204 |
+
}
|
notebooks/flagged/log.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Essay type,Essay text,Results & Comments,flag,username,timestamp
|
| 2 |
+
,,,,,2024-04-25 16:59:31.738799
|
pipeline.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Description: Pipeline running a model on user input.
|
| 2 |
+
# ML/pipeline.py
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import tensorflow as tf
|
| 7 |
+
from transformers import BertTokenizer, TFBertModel
|
| 8 |
+
|
| 9 |
+
class Pipeline:
    """Pipeline running a BERT-based regression model on user input.

    Wraps a frozen ``bert-base-uncased`` encoder with a small dense regression
    head and maps raw essay texts to an IELTS-style band score.
    """

    def __init__(self) -> None:
        """Initializes the tokenizer and loads the scoring model."""

        # Static configuration shared by preprocessing and model construction.
        self.__config = {
            "max_seq_length": 512,
            "bert_model_name": "bert-base-uncased",
            "model_type": "bert_text",
        }

        # Tokenizer must match the pretrained encoder used in __init_model.
        self.__bert_tokenizer = BertTokenizer.from_pretrained(
            self.__config["bert_model_name"]
        )

        # Build the model graph and load the trained weights.
        self.__model = self.__init_model()

    def run(self, input_data: list[str]) -> float:
        """Runs the pipeline on the given input data.

        Args:
            input_data: A list of input texts.

        Returns:
            The predicted score, rounded to the nearest valid IELTS band.
        """
        token_ids = self.__preprocessing(input_data)
        return self.__make_prediction(token_ids)

    def __preprocessing(self, data: list[str]) -> "tf.Tensor":
        """Tokenizes the texts and pads them to ``max_seq_length`` token ids.

        Note: the original implementation routed ``data`` through a pandas
        DataFrame and a numpy array before tokenizing; that round-trip added
        nothing and has been removed.
        """

        input_ids = self.__bert_tokenizer(
            list(data),
            padding=True,
            truncation=True,
            return_tensors="tf",
            max_length=self.__config["max_seq_length"],
        )["input_ids"]

        # padding=True only pads to the longest sequence in the batch, so
        # right-pad with zeros up to the fixed length the model input expects.
        return tf.pad(
            input_ids,
            [[0, 0], [0, self.__config["max_seq_length"] - input_ids.shape[1]]],
        )

    def __make_prediction(self, token_ids: "tf.Tensor") -> float:
        """Runs the model on the token ids and returns a rounded score."""

        # Only the first sample's scalar output is used (single-essay input).
        prediction = self.__model.predict(token_ids)[0][0]
        return self.__round_prediction(prediction)

    def __init_model(self) -> "tf.keras.models.Model":
        """Builds the frozen-BERT + regression-head model and loads weights."""

        # Pretrained encoder; kept as an attribute so its layers can be frozen.
        self.__bert_model = TFBertModel.from_pretrained(
            self.__config["bert_model_name"]
        )

        # Custom regression head mapping the [CLS] embedding to one score.
        regression_head = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation="linear"),
        ])

        # Combine BERT and the regression head into a single Keras model.
        input_ids = tf.keras.layers.Input(
            shape=(self.__config["max_seq_length"],), dtype=tf.int32
        )

        bert_output = self.__bert_model(input_ids)[0]  # last hidden states

        # First token of the last hidden state, i.e. the [CLS] representation
        # (not BERT's pooler_output, despite the original comment).
        cls_embedding = bert_output[:, 0, :]

        regression_output = regression_head(cls_embedding)

        model = tf.keras.models.Model(inputs=input_ids, outputs=regression_output)

        # Freeze BERT so only the regression head's trained weights matter.
        for layer in self.__bert_model.layers:
            layer.trainable = False

        # Load the fine-tuned checkpoint.
        # NOTE(review): path assumes the process runs from the repo root with
        # the checkpoint under ./app/ML/models — confirm for the deployment.
        model.load_weights(
            "./app/ML/models/training_" + self.__config["model_type"] + "/cp.ckpt"
        )

        return model

    def __round_prediction(self, value: float) -> float:
        """Rounds a given value to the nearest IELTS score.

        Valid bands run from 1.0 to 9.0 in 0.5 steps; out-of-range inputs are
        clamped to the nearest endpoint as a side effect of taking the min.
        """
        available_values = [x / 2 for x in range(2, 19)]  # 1.0, 1.5, ..., 9.0
        closest_value = min(available_values, key=lambda x: abs(x - value))
        return closest_value
|
references/.gitkeep
ADDED
|
File without changes
|
reports/.gitkeep
ADDED
|
File without changes
|
reports/figures/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# local package
|
| 2 |
+
-e .
|
| 3 |
+
|
| 4 |
+
# external requirements
|
| 5 |
+
click
|
| 6 |
+
Sphinx
|
| 7 |
+
coverage
|
| 8 |
+
awscli
|
| 9 |
+
flake8
|
| 10 |
+
python-dotenv>=0.5.1
|
| 11 |
+
aiofiles==23.2.1
|
| 12 |
+
aiohttp==3.9.5
|
| 13 |
+
aiosignal==1.3.1
|
| 14 |
+
altair==5.3.0
|
| 15 |
+
annotated-types==0.6.0
|
| 16 |
+
anyio==4.3.0
|
| 17 |
+
attrs==23.2.0
|
| 18 |
+
certifi==2024.2.2
|
| 19 |
+
charset-normalizer==3.3.2
|
| 20 |
+
click==8.1.7
|
| 21 |
+
colorama==0.4.6
|
| 22 |
+
contourpy==1.2.1
|
| 23 |
+
cycler==0.12.1
|
| 24 |
+
dataclasses-json==0.6.4
|
| 25 |
+
distro==1.9.0
|
| 26 |
+
fastapi==0.110.2
|
| 27 |
+
ffmpy==0.3.2
|
| 28 |
+
filelock==3.13.4
|
| 29 |
+
fonttools==4.51.0
|
| 30 |
+
frozenlist==1.4.1
|
| 31 |
+
fsspec==2024.3.1
|
| 32 |
+
gradio==4.28.2
|
| 33 |
+
gradio_client==0.16.0
|
| 34 |
+
greenlet==3.0.3
|
| 35 |
+
h11==0.14.0
|
| 36 |
+
httpcore==1.0.5
|
| 37 |
+
httpx==0.27.0
|
| 38 |
+
huggingface-hub==0.22.2
|
| 39 |
+
idna==3.7
|
| 40 |
+
importlib_resources==6.4.0
|
| 41 |
+
Jinja2==3.1.3
|
| 42 |
+
jsonpatch==1.33
|
| 43 |
+
jsonpointer==2.4
|
| 44 |
+
jsonschema==4.21.1
|
| 45 |
+
jsonschema-specifications==2023.12.1
|
| 46 |
+
kiwisolver==1.4.5
|
| 47 |
+
langchain-community==0.0.34
|
| 48 |
+
langchain-core==0.1.46
|
| 49 |
+
langchain-openai==0.1.3
|
| 50 |
+
langsmith==0.1.51
|
| 51 |
+
markdown-it-py==3.0.0
|
| 52 |
+
MarkupSafe==2.1.5
|
| 53 |
+
marshmallow==3.21.1
|
| 54 |
+
matplotlib==3.8.4
|
| 55 |
+
mdurl==0.1.2
|
| 56 |
+
multidict==6.0.5
|
| 57 |
+
mypy-extensions==1.0.0
|
| 58 |
+
numpy==1.26.4
|
| 59 |
+
openai==1.23.6
|
| 60 |
+
orjson==3.10.1
|
| 61 |
+
packaging==23.2
|
| 62 |
+
pandas==2.2.2
|
| 63 |
+
pillow==10.3.0
|
| 64 |
+
pydantic==2.7.1
|
| 65 |
+
pydantic_core==2.18.2
|
| 66 |
+
pydub==0.25.1
|
| 67 |
+
Pygments==2.17.2
|
| 68 |
+
pyparsing==3.1.2
|
| 69 |
+
python-dateutil==2.9.0.post0
|
| 70 |
+
python-dotenv==1.0.1
|
| 71 |
+
python-multipart==0.0.9
|
| 72 |
+
pytz==2024.1
|
| 73 |
+
PyYAML==6.0.1
|
| 74 |
+
referencing==0.35.0
|
| 75 |
+
regex==2024.4.16
|
| 76 |
+
requests==2.31.0
|
| 77 |
+
rich==13.7.1
|
| 78 |
+
rpds-py==0.18.0
|
| 79 |
+
ruff==0.4.2
|
| 80 |
+
semantic-version==2.10.0
|
| 81 |
+
shellingham==1.5.4
|
| 82 |
+
six==1.16.0
|
| 83 |
+
sniffio==1.3.1
|
| 84 |
+
SQLAlchemy==2.0.29
|
| 85 |
+
starlette==0.37.2
|
| 86 |
+
tenacity==8.2.3
|
| 87 |
+
tiktoken==0.6.0
|
| 88 |
+
tomlkit==0.12.0
|
| 89 |
+
toolz==0.12.1
|
| 90 |
+
tqdm==4.66.2
|
| 91 |
+
typer==0.12.3
|
| 92 |
+
typing-inspect==0.9.0
|
| 93 |
+
typing_extensions==4.11.0
|
| 94 |
+
tzdata==2024.1
|
| 95 |
+
urllib3==2.2.1
|
| 96 |
+
uvicorn==0.29.0
|
| 97 |
+
websockets==11.0.3
|
| 98 |
+
yarl==1.9.4
|
setup.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import find_packages, setup
|
| 2 |
+
|
| 3 |
+
# Package metadata for the local `src` package; installed in editable mode
# via the `-e .` entry at the top of requirements.txt.
setup(
    name='src',
    packages=find_packages(),
    version='0.1.0',
    description='A short description of the project.',  # TODO: replace cookiecutter placeholder
    author='Aleksandr Shishkov',
    license='MIT',
)
|
src/__init__.py
ADDED
|
File without changes
|
src/chains/chain_creation.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv, find_dotenv
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
|
| 7 |
+
class Chain:
    """Base essay-evaluation chain: prompt template -> chat model -> string.

    Args:
        template (str): The prompt template used to build the chain; must
            contain ``{essay_topic}`` and ``{essay_text}`` placeholders.
        model (str, optional): Name of the chat model to use. Defaults to
            "gpt-4".

    Attributes:
        model (ChatOpenAI): The chat model instance.
        template (str): The prompt template used for evaluation.
    """

    def __init__(self, template, model="gpt-4"):
        # Load OPENAI_API_KEY (and friends) from a local .env before the
        # model client is constructed.
        load_dotenv(find_dotenv())
        self.template = template
        self.model = ChatOpenAI(model=model)

    def invoke(self, essay_topic, essay_text):
        """Run the evaluation chain on one essay.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The model's evaluation output.
        """
        # Rebuild the runnable each call so changes to self.template between
        # calls take effect.
        pipeline = (
            ChatPromptTemplate.from_template(self.template)
            | self.model
            | StrOutputParser()
        )
        return pipeline.invoke({"essay_topic": essay_topic, "essay_text": essay_text})
|
| 47 |
+
|
src/chains/general.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .chain_creation import Chain
|
| 2 |
+
|
| 3 |
+
class General_Chain(Chain):
    """
    A class representing the General Chain for essay evaluation.

    This chain is responsible for evaluating an essay text based on specific criteria and providing detailed feedback.

    Attributes:
        template (str): The template for the evaluation instructions, response structure, essay topic, and essay text.

    Methods:
        __init__(): Initializes the General_Chain object.
        get_general_template(): Returns the template for the evaluation instructions, response structure, essay topic, and essay text.
        invoke(essay_topic, essay_text): Invokes the General_Chain and returns the evaluation result.

    """

    def __init__(self) -> None:
        """
        Initializes the General_Chain object.

        It sets the template attribute by calling the get_general_template() method and passes it to the parent class's __init__() method.

        """
        self.template = self.get_general_template()
        super().__init__(self.template)

    def get_general_template(self) -> str:
        """
        Returns the template for the evaluation instructions, response structure, essay topic, and essay text.

        Returns:
            str: The template string, with ``{essay_topic}`` and
            ``{essay_text}`` placeholders filled in by the parent chain.

        """
        # The exact wording below is the model's instruction set — treat it
        # as behavior, not documentation.
        GENERAL_TEMPLATE = """
You are a professional essay evaluator.
Your task is to evaluate an essay text that I provided below. Ensure the essay meets the basic requirements in structure, length, coherence, and language proficiency. Then, provide detailed feedback on the essay.
The essay topic is optionally available below. The feedback should not be influenced by the availability of the topic. Use the topic only as a context for evaluation.
More detailed evaluation instructions and the essay text are provided below:

### EVALUATION INSTRUCTIONS:
1. Check if the essay conforms to the standard essay format, including introduction, body paragraphs, and conclusion.
Ensure the essay is of appropriate length (minimum 50 words).
Evaluate whether the essay addresses the given topic comprehensively and logically.

2. Provide specific feedback, highlighting strengths and areas for improvement.
Offer advice on how the essay can be enhanced to achieve a higher score in each category.
Suggest alternative vocabulary, sentence structures, or argumentative strategies where applicable.

3. Strictly follow the response structure below, do not say something after you give the scores and feedback. If you have nothing to say, leave it blank. If the text is completely not related to an essay (e.g it is just one sentence) you may not follow the structure and point out the issue.

### RESPONSE STRUCTURE
Use only the following answer structure and do not include any personal opinions or biases in the evaluation:
"
Feedback Summary: [Provide a brief summary of the feedback]
Detailed Feedback: [Provide detailed feedback here]
"

### ESSAY TOPIC (OPTIONAL):
{essay_topic}

### ESSAY TEXT:
{essay_text}
"""

        return GENERAL_TEMPLATE

    def invoke(self, essay_topic: str, essay_text: str) -> str:
        """
        Invokes the General_Chain and returns the evaluation result.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result.

        """
        # Pure delegation to Chain.invoke; kept for API symmetry with the
        # sibling chain classes.
        return super().invoke(essay_topic, essay_text)
|
src/chains/ielts.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .chain_creation import Chain
|
| 2 |
+
|
| 3 |
+
class IELTS_Chain(Chain):
    """
    A class representing an IELTS essay evaluation chain.

    This chain is responsible for evaluating an essay text according to the International English Language Testing System (IELTS) format.
    It checks if the essay conforms to the standard IELTS essay format, evaluates the essay based on the IELTS scoring system, and provides detailed feedback on each criterion.

    Attributes:
        template (str): The template for the IELTS evaluation instructions, response structure, example response, essay topic, and essay text.

    Methods:
        __init__(): Initializes the IELTS_Chain object.
        get_ielts_template(): Returns the IELTS evaluation template.
        invoke(essay_topic, essay_text): Invokes the IELTS evaluation chain with the given essay topic and essay text.
    """

    def __init__(self) -> None:
        """
        Initializes the IELTS_Chain object.

        It sets the template attribute by calling the get_ielts_template() method and passes it to the parent class's __init__() method.
        """
        self.template = self.get_ielts_template()
        super().__init__(self.template)

    def get_ielts_template(self) -> str:
        """
        Returns the IELTS evaluation template.

        Returns:
            str: The IELTS evaluation template containing evaluation instructions, response structure, example response, essay topic, and essay text.
        """
        # NOTE(review): every section of this template below the preamble is a
        # literal "..." placeholder — the actual IELTS instructions, response
        # structure, example response, and {essay_topic}/{essay_text} slots
        # appear to be missing. Confirm whether the full template was meant to
        # be filled in here.
        IELTS_TEMPLATE = """
You are a professional IELTS essay evaluator.
Your task is to evaluate an essay text that I provided below according to the International English Language Testing System (IELTS) format. Ensure the essay meets the requirements of an IELTS essay in terms of structure, length, coherence, and language proficiency. Then, grade the essay based on the IELTS scoring system and provide detailed feedback on each criterion.
The essay topic is optionally available below. The grade should not be influenced by the availability of the topic. Use the topic only as a context for evaluation.
More detailed evaluation instructions and the essay text are provided below:

### EVALUATION INSTRUCTIONS:
...
### RESPONSE STRUCTURE:
...
### EXAMPLE RESPONSE:
...
### ESSAY TOPIC (OPTIONAL):
...
### ESSAY TEXT:
...

"""
        return IELTS_TEMPLATE

    def invoke(self, essay_topic: str, essay_text: str) -> str:
        """
        Invokes the IELTS evaluation chain with the given essay topic and essay text.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The result of the IELTS evaluation chain.
        """
        # Pure delegation to Chain.invoke; kept for API symmetry with the
        # sibling chain classes.
        return super().invoke(essay_topic, essay_text)
|
src/chains/toefl.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .chain_creation import Chain
|
| 2 |
+
|
| 3 |
+
class TOEFL_Chain(Chain):
    """
    A class representing a TOEFL essay evaluator chain.

    This chain is responsible for evaluating an essay text according to the Test of English as a Foreign Language (TOEFL) format.
    It checks if the essay conforms to the standard TOEFL essay format, evaluates the essay based on the TOEFL scoring system, and provides detailed feedback on each criterion.

    Attributes:
        template (str): The TOEFL evaluation template.

    Methods:
        __init__: Initializes the TOEFL_Chain object.
        get_toefl_template: Returns the TOEFL evaluation template.
        invoke: Invokes the TOEFL essay evaluation process.

    """

    # NOTE(review): this module defines `TOEFL_Chain` a second time further
    # down the file; that later definition shadows this one at import time,
    # making this placeholder version dead code. Consider removing one of the
    # two definitions.

    def __init__(self) -> None:
        """
        Initializes the TOEFL_Chain object.

        It sets the template attribute by calling the get_toefl_template method and passes it to the parent class constructor.

        """
        self.template = self.get_toefl_template()
        super().__init__(self.template)

    def get_toefl_template(self) -> str:
        """
        Returns the TOEFL evaluation template.

        Returns:
            str: The TOEFL evaluation template.

        """
        # NOTE(review): the template body is a literal placeholder, not real
        # evaluation instructions — the filled-in template lives in the later
        # redefinition of this class.
        TOEFL_TEMPLATE = """
... (TOEFL evaluation template content)
"""

        return TOEFL_TEMPLATE

    def invoke(self, essay_topic: str, essay_text: str) -> str:
        """
        Invokes the TOEFL essay evaluation process.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result of the essay.

        """
        # Pure delegation to Chain.invoke; kept for API symmetry with the
        # sibling chain classes.
        return super().invoke(essay_topic, essay_text)
|
| 57 |
+
class TOEFL_Chain(Chain):
|
| 58 |
+
def __init__(self):
|
| 59 |
+
self.template = self.get_toefl_template()
|
| 60 |
+
super().__init__(self.template)
|
| 61 |
+
|
| 62 |
+
def get_toefl_template(self):
|
| 63 |
+
TOEFL_TEMPLATE = """
|
| 64 |
+
You are a professional TOEFL essay evaluator.
|
| 65 |
+
Your task is to evaluate an essay text that I provided below according to the Test of English as a Foreign Language (TOEFL) format. Ensure the essay meets the requirements of a TOEFL essay in terms of structure, length, coherence, and language proficiency. Then, grade the essay based on the TOEFL scoring system and provide detailed feedback on each criterion.
|
| 66 |
+
The essay topic is optionally available below. The grade should not be influenced by the availability of the topic. Use the topic only as a context for evaluation.
|
| 67 |
+
More detailed evaluation instructions and the essay text are provided below:
|
| 68 |
+
|
| 69 |
+
### EVALUATION INSTRUCTIONS:
|
| 70 |
+
1. Check if the essay conforms to the standard TOEFL essay format, including introduction, body paragraphs, and conclusion.
|
| 71 |
+
Ensure the essay is of appropriate length (minimum 300 words).
|
| 72 |
+
Evaluate whether the essay addresses the given topic comprehensively and logically.
|
| 73 |
+
|
| 74 |
+
2. Follow the grading system specified below and provide detailed feedback for each criterion, highlighting strengths and areas for improvement.
|
| 75 |
+
Offer advice on how the essay can be enhanced to achieve a higher score in each category.
|
| 76 |
+
Suggest alternative vocabulary, sentence structures, or argumentative strategies where applicable.
|
| 77 |
+
|
| 78 |
+
3. Strictly follow the response structure below, do not say something after you give the scores and feedback. If you have nothing to say, leave it blank. If the text is completely not related to an essay (e.g., it is just one sentence), you may not follow the structure and point out the issue.
|
| 79 |
+
|
| 80 |
+
### GRADING SYSTEM:
|
| 81 |
+
Score 5:
|
| 82 |
+
-Effectively addresses the topic and task.
|
| 83 |
+
-Is well organized and developed.
|
| 84 |
+
-Displays strong and consistent language skills with minimal errors.
|
| 85 |
+
Score 4:
|
| 86 |
+
-Addresses the topic and task well, though some points may not be fully elaborated.
|
| 87 |
+
-Is generally well organized and well developed.
|
| 88 |
+
-Occasional noticeable minor errors in structure, word form, or use of idiomatic language that do not interfere with meaning.
|
| 89 |
+
Score 3:
|
| 90 |
+
-Addresses the topic and task using somewhat developed explanations, exemplifications, and/or details.
|
| 91 |
+
-May demonstrate inconsistent facility in sentence formation and word choice that may result in lack of clarity and occasionally obscure meaning.
|
| 92 |
+
Score 2:
|
| 93 |
+
-Limited development in response to the topic and task.
|
| 94 |
+
-Lack of details to support or illustrate generalizations in response to the task.
|
| 95 |
+
-An accumulation of errors in sentence structure and/or usage.
|
| 96 |
+
Score 1:
|
| 97 |
+
-Little or no detail, or irrelevant specifics, or questionable responsiveness to the task.
|
| 98 |
+
-Serious and frequent errors in sentence structure or usage.
|
| 99 |
+
Score 0:
|
| 100 |
+
-Merely copies sentences from the reading, rejects the topic or is not connected to the topic, is written in a foreign language,or is blank.
|
| 101 |
+
|
| 102 |
+
### RESPONSE STRUCTURE
|
| 103 |
+
Use only the following answer structure and do not include any personal opinions or biases in the evaluation:
|
| 104 |
+
"Score: X
|
| 105 |
+
Key points: [Provide key points here]
|
| 106 |
+
Detailed Feedback: [Provide detailed feedback here]"
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
### EXAMPLE RESPONSE:
|
| 110 |
+
"
|
| 111 |
+
There are three key things this essay does that results in its high score, and each is explained in more detail below.
|
| 112 |
+
-Is well organized
|
| 113 |
+
-Uses specific examples
|
| 114 |
+
-Few grammatical/spelling errors
|
| 115 |
+
The essay, like the first one, is well organized. The writer’s position is clear within the first few sentences, and the rest of the essay elaborates on that position. Each paragraph begins with a new major point that is then explained. This logical flow of ideas is easy for readers to follow and shows that the writer knows how to set up a clear argument.
|
| 116 |
+
Another reason the essay received a top score is because the writer used specific examples to make her point. By using specific examples, such as a friend buying a new outfit and asking your opinion and phrases businesses use to sell products, the writer makes her argument stronger and more concrete.
|
| 117 |
+
Finally, despite the lack of capitalization throughout the essay, there are few spelling and grammatical errors, and the ones that do exist don’t detract from the meaning of the essay or make it confusing to understand. This shows a strong command of English and the ability to write in-depth essays that are clear and get their point across.
|
| 118 |
+
"
|
| 119 |
+
|
| 120 |
+
### ESSAY TOPIC (OPTIONAL):
|
| 121 |
+
{essay_topic}
|
| 122 |
+
|
| 123 |
+
### ESSAY TEXT:
|
| 124 |
+
{essay_text}
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
return TOEFL_TEMPLATE
|
| 128 |
+
|
| 129 |
+
def invoke(self, essay_topic, essay_text):
|
| 130 |
+
return super().invoke(essay_topic, essay_text)
|
src/data/.gitkeep
ADDED
|
File without changes
|
src/data/__init__.py
ADDED
|
File without changes
|
src/data/components/__init__.py
ADDED
|
File without changes
|
src/data/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from src.logging import logger
|
| 6 |
+
import urllib.request as request
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
from src.data.entity import DataIngestionConfig
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DataIngestion:
    """Downloads the configured Hugging Face dataset and archives/extracts it locally."""

    def __init__(self, config: DataIngestionConfig):
        # Configuration produced by ConfigurationManager.get_data_ingestion_config().
        self.config = config

    def download_file(self):
        """Download the HF dataset and archive it as a zip, unless the archive already exists.

        The dataset is materialized as a temporary CSV in the current working
        directory, written into the zip archive at ``config.local_data_file``,
        and the temporary CSV is removed afterwards (also on failure).
        """
        if not os.path.exists(self.config.local_data_file):
            raw_dataset = load_dataset(self.config.hf_dataset_name, split=self.config.hf_dataset_split)
            df = pd.DataFrame(raw_dataset)

            tmp_csv = 'raw_dataset.csv'
            try:
                with zipfile.ZipFile(self.config.local_data_file, 'w') as z:
                    df.to_csv(tmp_csv, index=False)  # Save DataFrame to CSV file
                    z.write(tmp_csv)                 # Write CSV file to the zip archive
            finally:
                # Fix: previously the temp CSV leaked if archiving raised mid-way.
                if os.path.exists(tmp_csv):
                    os.remove(tmp_csv)

            # Fix: log the actual archive path instead of the hardcoded "data.zip".
            logger.info(f"Dataset {self.config.hf_dataset_name} downloaded and archived as {self.config.local_data_file}!")
        else:
            logger.info(f"File already exists. File size: {Path(self.config.local_data_file).stat().st_size}")


    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        Creates the target directory if needed. Returns None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Data extracted at {unzip_path}")
|
src/data/components/data_preprocessing.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.logging import logger
|
| 4 |
+
from datasets import Dataset
|
| 5 |
+
from src.data.entity import DataPreprocessingConfig
|
| 6 |
+
|
| 7 |
+
class DataPreprocessing:
    """Turns the raw question/answer CSV into an instruction-finetuning dataset."""

    def __init__(self, config: DataPreprocessingConfig):
        # Configuration produced by ConfigurationManager.get_data_preprocessing_config().
        self.config = config

    def __form_finetuning_dataset(self, dataset_dict: dict, question_key: str, answer_key: str) -> Dataset:
        """Build instruction/response/text records from the question and answer columns.

        Each record carries the bare instruction, the answer terminated with an
        "### End" marker, and the fully rendered prompt+response training text.
        """
        instruction_template = """{question}"""

        prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{instruction}

### Response:\n"""

        records = []
        sample_count = len(dataset_dict[question_key])
        for idx in range(sample_count):
            instruction = instruction_template.format(question=dataset_dict[question_key][idx])
            response = dataset_dict[answer_key][idx] + "\n### End"
            records.append({
                "instruction": instruction,
                "response": response,
                "text": prompt_template.format(instruction=instruction) + response,
            })

        return Dataset.from_list(records)


    def convert(self):
        """Load the raw CSV, build the finetuning dataset and persist it as CSV."""
        raw_df = pd.read_csv(self.config.raw_data_path)
        raw_dict = raw_df.to_dict()
        logger.info("Successfully loaded the raw dataset")

        finetuning_dataset = self.__form_finetuning_dataset(raw_dict, question_key=self.config.question_key, answer_key=self.config.answer_key)
        output_path = os.path.join(self.config.root_dir, "finetuning_dataset.csv")
        finetuning_dataset.to_pandas().to_csv(output_path, index=False)
        logger.info("Successfully saved the finetuning dataset")
|
src/data/components/data_transformation.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import datasets
|
| 4 |
+
from src.logging import logger
|
| 5 |
+
from datasets import Dataset, DatasetDict
|
| 6 |
+
from src.data.entity import DataTransformationConfig
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class DataTransformation:
    """Splits the finetuning dataset into train/test/eval CSVs."""

    def __init__(self, config: DataTransformationConfig):
        # Configuration produced by ConfigurationManager.get_data_transformation_config().
        self.config = config

    def __split_data(self, dataset: Dataset):
        """Split into train/test/eval using the configured fractions (seeded shuffle)."""
        first_cut = dataset.train_test_split(train_size=self.config.train_data_split, shuffle=True, seed=42)
        second_cut = first_cut['test'].train_test_split(train_size=self.config.test_data_split, shuffle=True, seed=42)

        # The remainder of the second cut becomes the eval split.
        return DatasetDict({
            'train' : first_cut['train'],
            'test'  : second_cut['train'],
            'eval'  : second_cut['test'],
        })

    def __transform_data(self, dataset: Dataset):
        """Transforms the data to the format required by the model (currently a no-op)."""
        return dataset

    def convert(self):
        """Load the finetuning CSV, transform, split, and save the three split CSVs."""
        source_df = pd.read_csv(self.config.finetuning_data_path)
        dataset = Dataset.from_pandas(source_df)
        logger.info("Successfully loaded the finetuning data")

        dataset = self.__transform_data(dataset)
        logger.info("Successfully transformed the finetuning data")

        splits = self.__split_data(dataset)
        logger.info("Successfully splitted the data")

        # One pass over the splits replaces the three copy-pasted save stanzas.
        for split_name, file_name in (('train', "train_dataset.csv"),
                                      ('test', "test_dataset.csv"),
                                      ('eval', "eval_dataset.csv")):
            splits[split_name].to_pandas().to_csv(os.path.join(self.config.root_dir, file_name), index=False)
        logger.info("Successfully saved the transformed data")
|
src/data/configuration.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from src.utils.common import read_yaml
|
| 4 |
+
from dotenv import find_dotenv, load_dotenv
|
| 5 |
+
from src.data.entity import (DataIngestionConfig,
|
| 6 |
+
DataPreprocessingConfig,
|
| 7 |
+
DataTransformationConfig)
|
| 8 |
+
|
| 9 |
+
_ = load_dotenv(find_dotenv()) # read local .env file

# Path to the data-pipeline YAML config; must be provided via the environment or a .env file.
# NOTE(review): os.environ[...] raises KeyError at import time if the variable is unset — confirm that is intended.
DATA_CONFIG_FILE_PATH = os.environ['DATA_CONFIG_FILE_PATH']
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConfigurationManager:
    """Reads the data-pipeline YAML config and exposes per-stage typed configs."""

    def __init__(self,
                 config_filepath = DATA_CONFIG_FILE_PATH):
        # Parsed YAML with attribute access to the per-stage sections.
        self.config = read_yaml(Path(config_filepath))


    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ingestion stage config from the `data_ingestion` section."""
        section = self.config.data_ingestion
        return DataIngestionConfig(
            root_dir=section.root_dir,
            hf_dataset_name=section.hf_dataset_name,
            hf_dataset_split=section.hf_dataset_split,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )


    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the preprocessing stage config from the `data_preprocessing` section."""
        section = self.config.data_preprocessing
        return DataPreprocessingConfig(
            root_dir=section.root_dir,
            raw_data_path=section.raw_data_path,
            question_key=section.question_key,
            answer_key=section.answer_key,
        )


    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the transformation stage config from the `data_transformation` section."""
        section = self.config.data_transformation
        return DataTransformationConfig(
            root_dir=section.root_dir,
            finetuning_data_path=section.finetuning_data_path,
            train_data_split=section.train_data_split,
            test_data_split=section.test_data_split,
            eval_data_split=section.eval_data_split,
        )
|
src/data/entity.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    # Immutable settings for the ingestion stage, built by ConfigurationManager from the YAML config.
    root_dir: Path          # working directory for ingestion artifacts
    hf_dataset_name: str    # Hugging Face dataset identifier passed to load_dataset
    hf_dataset_split: str   # dataset split to download (e.g. "train")
    local_data_file: Path   # path of the zip archive the raw dataset is stored in
    unzip_dir: Path         # directory the archive is extracted into
|
| 11 |
+
|
| 12 |
+
@dataclass(frozen=True)
class DataPreprocessingConfig:
    # Immutable settings for the preprocessing stage, built by ConfigurationManager from the YAML config.
    root_dir: Path       # output directory for finetuning_dataset.csv
    raw_data_path: Path  # CSV produced by the ingestion stage
    question_key: str    # column holding the question/instruction text
    answer_key: str      # column holding the answer/response text
|
| 18 |
+
|
| 19 |
+
@dataclass(frozen=True)
class DataTransformationConfig:
    # Immutable settings for the train/test/eval split stage, built by ConfigurationManager.
    root_dir: Path              # output directory for the split CSVs
    finetuning_data_path: Path  # CSV produced by the preprocessing stage
    train_data_split: float     # fraction of samples assigned to the train split
    test_data_split: float      # fraction of the remainder assigned to the test split
    eval_data_split: float      # NOTE(review): not read by DataTransformation — eval is the leftover of the second split; confirm
|
src/data/make_dataset.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from src.logging import logger
|
| 5 |
+
|
| 6 |
+
from src.data.pipeline.stage_01_data_ingestion import DataIngestionPipeline
|
| 7 |
+
from src.data.pipeline.stage_02_data_preprocessing import DataPreprocessingPipeline
|
| 8 |
+
from src.data.pipeline.stage_03_data_transformation import DataTransformationPipeline
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

    Executes the three pipeline stages in order (ingestion, preprocessing,
    transformation). Each stage is logged; any failure is logged with its
    traceback and re-raised, aborting the remaining stages.
    """
    # One loop replaces the three copy-pasted try/except stanzas.
    stages = (
        ('Data Ingestion', DataIngestionPipeline),
        ('Data Preprocessing', DataPreprocessingPipeline),
        ('Data Transformation', DataTransformationPipeline),
    )
    for stage_name, pipeline_cls in stages:
        try:
            logger.info(f'>>>>> {stage_name} started <<<<<')

            pipeline = pipeline_cls()
            pipeline.main()

            logger.info(f'>>>>> {stage_name} completed <<<<<')

        except Exception as e:
            logger.exception(e)
            # Bare raise preserves the original traceback (unlike `raise e`).
            raise
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
if __name__ == '__main__':

    # NOTE(review): assumes the script is launched from src/data/ so that the
    # project root becomes the CWD; running it from anywhere else will break
    # relative config/data paths — confirm this launch convention.
    os.chdir("../../")

    main()
|
src/data/pipeline/__init__.py
ADDED
|
File without changes
|
src/data/pipeline/stage_01_data_ingestion.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.data.configuration import ConfigurationManager
|
| 2 |
+
from src.data.components.data_ingestion import DataIngestion
|
| 3 |
+
from src.logging import logger
|
| 4 |
+
|
| 5 |
+
class DataIngestionPipeline:
    """Wires the resolved configuration into the DataIngestion component."""

    def main(self):
        """Run ingestion: resolve config, download the dataset, extract the archive."""
        manager = ConfigurationManager()
        ingestion = DataIngestion(config=manager.get_data_ingestion_config())
        ingestion.download_file()
        ingestion.extract_zip_file()

if __name__ == "__main__":
    try:
        logger.info('>>>>> Data Ingestion started <<<<<')

        pipeline = DataIngestionPipeline()
        pipeline.main()

        logger.info('>>>>> Data Ingestion completed <<<<<')

    except Exception as e:
        logger.exception(e)
        raise e
|
| 28 |
+
|
src/data/pipeline/stage_02_data_preprocessing.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.data.configuration import ConfigurationManager
|
| 2 |
+
from src.data.components.data_preprocessing import DataPreprocessing
|
| 3 |
+
from src.logging import logger
|
| 4 |
+
|
| 5 |
+
class DataPreprocessingPipeline:
    """Wires the resolved configuration into the DataPreprocessing component."""

    def main(self):
        """Run preprocessing: resolve config and convert the raw CSV."""
        manager = ConfigurationManager()
        preprocessing = DataPreprocessing(config=manager.get_data_preprocessing_config())
        preprocessing.convert()


if __name__ == "__main__":
    try:
        logger.info('>>>>> Data Preprocessing started <<<<<')

        pipeline = DataPreprocessingPipeline()
        pipeline.main()

        logger.info('>>>>> Data Preprocessing completed <<<<<')

    except Exception as e:
        logger.exception(e)
        raise e
|
| 28 |
+
|
| 29 |
+
|
src/data/pipeline/stage_03_data_transformation.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.data.configuration import ConfigurationManager
|
| 2 |
+
from src.data.components.data_transformation import DataTransformation
|
| 3 |
+
from src.logging import logger
|
| 4 |
+
|
| 5 |
+
class DataTransformationPipeline:
    """Wires the resolved configuration into the DataTransformation component."""

    def main(self):
        """Run transformation: resolve config and produce the split CSVs."""
        manager = ConfigurationManager()
        transformation = DataTransformation(config=manager.get_data_transformation_config())
        transformation.convert()


if __name__ == "__main__":
    try:
        logger.info('>>>>> Data Transformation started <<<<<')

        pipeline = DataTransformationPipeline()
        pipeline.main()

        logger.info('>>>>> Data Transformation completed <<<<<')

    except Exception as e:
        logger.exception(e)
        raise e
|
src/essay_evaluation.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from chains.ielts import IELTS_Chain
|
| 2 |
+
from chains.toefl import TOEFL_Chain
|
| 3 |
+
from chains.general import General_Chain
|
| 4 |
+
|
| 5 |
+
class EssayEvaluation:
    """
    Static dispatch facade over the exam-specific essay evaluation chains.
    """

    @staticmethod
    def evaluate(type, essay_topic, essay_text):
        """
        Evaluates an essay with the evaluator matching the given type.

        Args:
            type (str): The type of evaluation to perform. Possible values are "IELTS", "TOEFL", or "General".
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result, or "Invalid type selected." for an unknown type.
        """
        # Dispatch table replaces the original if/elif chain; unknown keys fall
        # through to the same "Invalid type selected." message.
        dispatch = {
            "IELTS": EssayEvaluation.evaluate_ielts,
            "TOEFL": EssayEvaluation.evaluate_toefl,
            "General": EssayEvaluation.evaluate_general,
        }
        handler = dispatch.get(type)
        if handler is None:
            return "Invalid type selected."
        return handler(essay_topic, essay_text)

    @staticmethod
    def evaluate_ielts(essay_topic, essay_text):
        """
        Evaluates an essay with the IELTS chain.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result.
        """
        return IELTS_Chain().invoke(essay_topic, essay_text)

    @staticmethod
    def evaluate_toefl(essay_topic, essay_text):
        """
        Evaluates an essay with the TOEFL chain.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result.
        """
        return TOEFL_Chain().invoke(essay_topic, essay_text)

    @staticmethod
    def evaluate_general(essay_topic, essay_text):
        """
        Evaluates an essay with the general-purpose chain.

        Args:
            essay_topic (str): The topic of the essay.
            essay_text (str): The text of the essay.

        Returns:
            str: The evaluation result.
        """
        return General_Chain().invoke(essay_topic, essay_text)
|
src/features/.gitkeep
ADDED
|
File without changes
|
src/features/__init__.py
ADDED
|
File without changes
|
src/features/build_features.py
ADDED
|
File without changes
|
src/logger/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys
import logging

# Shared logging setup for the project: modules import `logger` from this package.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
log_dir = "logs"
log_filepath = os.path.join(log_dir,"running_logs.log")
os.makedirs(log_dir, exist_ok=True)  # ensure logs/ exists before the FileHandler opens the file

logging.basicConfig(
    level= logging.INFO,
    format= logging_str,

    handlers=[
        logging.FileHandler(log_filepath),   # persist every record to logs/running_logs.log
        logging.StreamHandler(sys.stdout)    # mirror records to stdout
    ]
)

# NOTE(review): this package is `src.logger`, but sibling modules import
# `from src.logging import logger` — confirm the package/import names agree.
logger = logging.getLogger("main_logger")
|
src/main.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
from essay_evaluation import EssayEvaluation
from logger import logger

# Gradio UI: the three inputs are passed positionally to
# EssayEvaluation.evaluate(type, essay_topic, essay_text), whose string
# result fills the single output textbox.
demo = gr.Interface(
    title="Essay Evaluation Assistant",
    description="Master English with our AI Essay Assistant! Elevate your writing skills or prepare for IELTS and TOEFL. Get precise feedback to polish your skills and conquer every challenge!",
    fn = EssayEvaluation.evaluate,
    inputs = [
        # Radio values must match the keys EssayEvaluation.evaluate dispatches on.
        gr.Radio(["IELTS", "TOEFL", "General"], label="Essay type", info="What type of essay do you have?"),
        gr.Textbox(label="Essay topic (optional)", placeholder="Enter your essay topic here...", container=True),
        gr.Textbox(label="Essay text", placeholder="Enter your essay here...", container=True)
    ],
    outputs = [
        gr.Textbox(label="Results & Comments")
    ],
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode) — confirm against the pinned gradio version.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()
|
src/models/.gitkeep
ADDED
|
File without changes
|