Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| st.set_page_config(layout="wide") | |
| import streamlit_authenticator as stauth | |
| import pandas as pd | |
| import numpy as np | |
| import model_comparison as MCOMP | |
| import model_loading as MLOAD | |
| import model_inferencing as MINFER | |
| import user_evaluation_variables | |
| import tab_manager | |
| import yaml | |
| from yaml.loader import SafeLoader | |
| from PIL import Image | |
# --- Module-level shared state ---------------------------------------------
AUTHENTICATOR = None  # streamlit_authenticator.Authenticate instance; set in user_login_create()
TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')  # app logo used in the sidebar and banner
USER_LOGGED_IN = False  # flipped to True after a successful login in user_login_create()
USER_DATABASE_PATH = './data/user_database.yaml'  # YAML file holding credentials/cookie/preauth config
def create_new_user(authenticator, users):
    """Render the registration widget and persist the user database.

    The authenticator mutates `users` in place when a new account is
    registered, so the YAML database is rewritten after the widget runs.
    """
    try:
        registered = authenticator.register_user('Register user', preauthorization=False)
        if registered:
            st.success('User registered successfully')
    except Exception as err:
        st.error(err)
    # Write the (possibly updated) credential store back to disk.
    with open(USER_DATABASE_PATH, 'w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
def forgot_password(authenticator, users):
    """Render the forgot-password widget and persist the user database."""
    try:
        forgot_username, forgot_email, new_password = authenticator.forgot_password(
            'Forgot password')
        if forgot_username:
            st.success('New password to be sent securely')
            # NOTE: the generated password must be delivered to the user
            # through a secure out-of-band channel.
    except Exception as err:
        st.error(err)
    # Persist any credential changes made by the widget.
    with open(USER_DATABASE_PATH, 'w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
def update_account_details(authenticator, users):
    """Let a logged-in user edit their account details, then persist the DB."""
    if st.session_state["authentication_status"]:
        try:
            updated = authenticator.update_user_details(
                st.session_state["username"], 'Update user details')
            if updated:
                st.success('Entries updated successfully')
        except Exception as err:
            st.error(err)
    # Write the credential store back to disk after the widget runs.
    with open(USER_DATABASE_PATH, 'w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
def reset_password(authenticator, users):
    """Let a logged-in user change their password, then persist the DB."""
    if st.session_state["authentication_status"]:
        try:
            changed = authenticator.reset_password(
                st.session_state["username"], 'Reset password')
            if changed:
                st.success('Password modified successfully')
        except Exception as err:
            st.error(err)
    # Persist the (possibly re-hashed) credentials.
    with open(USER_DATABASE_PATH, 'w') as db_file:
        yaml.dump(users, db_file, default_flow_style=False)
def user_login_create():
    """Render the sidebar login / register / account-details tabs.

    Loads the YAML user database, builds the global authenticator, and
    drives the authentication flow. Returns True once the user has
    successfully logged in, False otherwise.

    Fixes vs. original: `is False` / `is None` instead of `== False` /
    `== None` for singleton comparison, and removal of the redundant
    `global TBYB_LOGO` (the logo is only read, never reassigned) and the
    dead `users = None` pre-initialization.
    """
    global AUTHENTICATOR
    global USER_LOGGED_IN

    with open(USER_DATABASE_PATH) as file:
        # NOTE(review): SafeLoader is correct here — the DB is app-controlled.
        users = yaml.load(file, Loader=SafeLoader)

    AUTHENTICATOR = stauth.Authenticate(
        users['credentials'],
        users['cookie']['name'],
        users['cookie']['key'],
        users['cookie']['expiry_days'],
        users['preauthorized']
    )

    with st.sidebar:
        st.image(TBYB_LOGO, width=70)
        loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])
        with loginTab:
            name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
            if authentication_status:
                AUTHENTICATOR.logout('Logout', 'main')
                st.write(f'Welcome *{name}*')
                user_evaluation_variables.USERNAME = username
                USER_LOGGED_IN = True
            elif authentication_status is False:
                st.error('Username/password is incorrect')
                forgot_password(AUTHENTICATOR, users)
            elif authentication_status is None:
                st.warning('Please enter your username and password')
                forgot_password(AUTHENTICATOR, users)
        if not authentication_status:
            # Not logged in: offer registration.
            with registerTab:
                create_new_user(AUTHENTICATOR, users)
        else:
            # Logged in: show account details and password reset.
            with detailsTab:
                st.write('**Username:** ', username)
                st.write('**Name:** ', name)
                st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
                # update_account_details(AUTHENTICATOR, users)
                reset_password(AUTHENTICATOR, users)
    return USER_LOGGED_IN
def setup_page_banner():
    """Draw the centred TBYB logo and the page title at the top of the app.

    Fixes vs. original: removed the `for col in [...]: col = None` loop,
    which only rebound a local name and cleared nothing, and the redundant
    `global USER_LOGGED_IN` declaration (the variable was never assigned
    here).
    """
    # Nine equal columns; placing the logo in the middle one centres it.
    cols = st.columns(9)
    with cols[4]:
        st.image(TBYB_LOGO, use_column_width=True)
    st.title('Try Before You Bias (TBYB)')
    st.write('*A Quantitative T2I Bias Evaluation Tool*')
def setup_how_to():
    """Populate the collapsible "How to Use" walkthrough near the top of the page."""
    guide = st.expander("How to Use")
    # Steps 1-2: login and model selection.
    guide.write("1. Login to your TBYB Account using the bar on the right\n"
                "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
    guide.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
    # Step 3: sanity-check generation.
    guide.write("3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n")
    guide.image(Image.open('./assets/lykon_corgi.png'))
    # Steps 4-8: evaluation, results browsing, and contact details.
    guide.write("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
                " to evaluate your model once it has been loaded\n"
                "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n"
                "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n"
                "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the "
                " '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
                "8. For any questions or to report any bugs/issues. Please contact jordan.vice@uwa.edu.au.\n")
def setup_additional_information_tab(tab):
    """Fill the 'Additional Information' tab with the paper summary.

    Renders a TL;DR of the accompanying article: the three proposed bias
    metrics (distribution bias, Jaccard hallucination, generative miss
    rate), current tool constraints, and a misuse disclaimer.

    Args:
        tab: the Streamlit tab object to render into.
    """
    with tab:
        # --- Section 1: article reference and authorship -------------------
        st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
        st.markdown(
            """
            *Based on the article of the same name available here --PAPER HYPERLINK--

            Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian

            This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
            implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
            all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
            that others share their evaluations as we look to further the discussion on transparency and reliability
            of T2I models.
            """)

        # --- Section 2: method summary and the three metrics ---------------
        st.header('2. A (very) Brief Summary')
        st.image(Image.open('./assets/TBYB_flowchart.png'))
        st.markdown(
            """
            Bias in text-to-image models can propagate unfair social representations and could be exploited to
            aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
            methods focused on social biases. So, we proposed a bias evaluation methodology that considered
            general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
            """
        )
        st.markdown(
            """
            We proposed three novel metrics to quantify T2I model biases:
            1. Distribution Bias - $B_D$
            2. Jaccard Hallucination - $H_J$
            3. Generative Miss Rate - $M_G$

            Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
            """
        )
        # One expander column per metric.
        c1, c2, c3 = st.columns(3)
        with c1:
            with st.expander("Distribution Bias - $B_D$"):
                st.markdown(
                    """
                    Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                    in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                    detected in generated output image scenes.

                    So, everytime an object is detected in a scene, we update a dictionary (which is available for
                    download after running an evaluation). After evaluating a full set of images, you can use this
                    information to determine what objects appear more frequently than others.

                    After all images are evaluated, we sort the objects in descending order and normalize the data. We
                    then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:

                    $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i=1}}{2}$

                    So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                    that was heavily biased using pictures of animals in the wild. You might find that after running
                    evaluations, the most common objects detected were trees and grass - even if these objects weren't
                    specified in the prompt. This would result in a very low $B_D$ in comparison to a model that for
                    example was trained on images of dogs and animals in various different scenarios $\\rightarrow$
                    which would result in a *higher* $B_D$ in comparison.
                    """
                )
        with c2:
            with st.expander("Jaccard Hallucination - $H_J$"):
                st.markdown(
                    """
                    Hallucination is a very common phenomena that is discussed in relation to generative AI, particularly
                    in relation to some of the most popular large language models. Depending on where you look, hallucinations
                    can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                    that we echo in our bias evaluations.

                    Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                    T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                    specified. This indicates that there could be an innate shift in bias in the model, causing it to
                    add or omit certain objects.

                    Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                    hallucination. Then, we considered the Jaccard similarity coefficient, which
                    measures the similarity *and* diversity of two sets of objects/samples - defining this as
                    Jaccard Hallucination - $H_J$.

                    Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                    the corresponding output image. Then, we determine the intersect over union. For a model, we
                    calculate the average $H_J$ across generated images using:

                    $H_J = \\frac{\Sigma_{i=0}^{N-1}1-\\frac{\mathcal{X}_i\cap\mathcal{Y}_i}{\mathcal{X}_i\cup\mathcal{Y}_i}}{N}$
                    """
                )
        with c3:
            with st.expander("Generative Miss Rate - $M_G$"):
                st.markdown(
                    """
                    Whenever fairness and trust are discussed in the context of machine learning and AI systems,
                    performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
                    of evaluating bias, we thought that it would be important to see if there was a correlation
                    between bias and performance (as we predicted). And while the other metrics do evaluate biases
                    in terms of misalignment, they do not consider the relationship between bias and performance.

                    We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                    as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                    miss rate of the generative model will increase as a result. This was a major consideration when
                    designing this metric.

                    We use the CLIP model as a binary classifier, differentiating between two classes:
                    - the prompt used to generate the image
                    - **NOT** the prompt

                    Through our experiments on intentionally-biased T2I models, we found that there was a clear
                    relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                    how badly model performances have been affected by their biases.
                    """
                )

        # --- Section 3: known limitations ----------------------------------
        st.header('3. TBYB Constraints')
        st.markdown(
            """
            While we have attempted to design a comprehensive, automated bias evaluation tool. We must acknowledge that
            in its infancy, TBYB has some constraints:
            - We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
            promise that all T2I models will work - if you run into any issues that you think should be possible, feel
            free to reach out!
            - Currently, a model_index.json file is required to load models and use them with TBYB, we will look to
            address other models in future works
            - TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
            - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
            models in the future.
            - Download, generation, inference and evaluation times are all hardware dependent.

            Keep in mind that these constraints may be removed or added to any time.
            """)

        # --- Section 4: disclaimer -----------------------------------------
        st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
        st.markdown(
            """
            Given this application is used for the assessment of T2I biases and relies on
            pre-trained models available on HuggingFace, we are not responsible for any content generated
            by public-facing models that have been used to generate images using this application.

            TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
            insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
            representations of marginalised groups, please address your concerns to the model providers.

            However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
            beneficial to the TBYB community to share evaluations of biased T2I models!

            We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
            given their growth in popularity in the computer science community recently.

            For further questions/queries or if you want to simply strike a conversation,
            please reach out to Jordan Vice at: jordan.vice@uwa.edu.au""")
# ---------------------------------------------------------------------------
# Page entry point: banner + how-to are always shown; the evaluation tabs are
# only built once the user has logged in via the sidebar.
# ---------------------------------------------------------------------------
setup_page_banner()
setup_how_to()
if user_login_create():
    # Logged in: build the six workflow tabs.
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
                                                  "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
    setup_additional_information_tab(tab6)
    # PLASTER THE LOGO EVERYWHERE
    # Placeholder content for the evaluation tabs until Setup completes.
    tab2.subheader("General Bias Evaluation")
    tab2.write("Waiting for \U0001F527 Setup to be complete...")
    tab3.subheader("Task-Oriented Bias Evaluation")
    tab3.write("Waiting for \U0001F527 Setup to be complete...")
    tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
    tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
    MCOMP.initialise_page(tab4)
    tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
    tab5.write("Waiting for \U0001F527 Setup to be complete...")
    with tab1:
        # Model-selection form: validates the HuggingFace repo ID and, on
        # submit, imports the pipeline into MINFER.TargetModel.
        with st.form("model_definition_form", clear_on_submit=True):
            modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
                                    'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
            submitted1 = st.form_submit_button("Submit")
            if modelID:
                with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
                    modelLoaded = MLOAD.check_if_model_exists(modelID)
                    if modelLoaded is not None:
                        # st.write("Located " + modelID + " model_index.json file")
                        st.write("Located " + modelID)
                        modelType = MLOAD.get_model_info(modelLoaded)
                        if modelType is not None:
                            st.write("Model is of Type: ", modelType)
                            if submitted1:
                                # Only import the (potentially large) pipeline on explicit submit.
                                MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
                                if MINFER.TargetModel is not None:
                                    st.write("Text-to-image pipeline looks like this:")
                                    st.write(MINFER.TargetModel)
                                # Record the chosen model for later evaluation runs.
                                user_evaluation_variables.MODEL = modelID
                                user_evaluation_variables.MODEL_TYPE = modelType
                    else:
                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
                                 ' Please check that that HuggingFace repo ID is valid.'
                                 ' For more help, please see the "How to Use" Tab above.', icon="🚨")
        if modelID:
            # Optional smoke-test form: generate one image with a free-form prompt.
            with st.form("example_image_gen_form", clear_on_submit=True):
                testPrompt = st.text_input('Input a random test prompt to test out your '
                                           'chosen model and see if its generating images:')
                submitted2 = st.form_submit_button("Submit")
                if testPrompt and submitted2:
                    with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
                        testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                        st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                        st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
                                Otherwise, feel free to load up a different model and run it again''')
        if MINFER.TargetModel is not None:
            # Setup complete: unlock the evaluation tabs.
            tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
else:
    # Logged out: clear cached comparison data and per-user evaluation state.
    MCOMP.databaseDF = None
    user_evaluation_variables.reset_variables('general')
    user_evaluation_variables.reset_variables('task-oriented')
    st.write('')
    st.warning('Log in or register your email to get started! ', icon="⚠️")