Projects

Projects#

This page shows statistics about the Open Seeds projects.

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# Root URL of the Open Seeds artifacts; "projects.csv" is read from it below.
baseurl = "https://raw.githubusercontent.com/open-life-science/open-life-science.github.io/main/_data/artifacts/openseeds/"

# Load the project table. `na_filter=False` keeps empty cells as empty strings
# instead of NaN, which the string operations below rely on.
project_df = pd.read_csv(baseurl + "projects.csv", index_col=0, na_filter=False)

# Comma-separated text columns become Python lists.
for list_column in ("participants", "mentors", "keywords"):
    project_df[list_column] = project_df[list_column].str.split(", ")

# Number of entries in each (already split) participant list.
project_df["participantNb"] = project_df["participants"].str.len()

# Prefix the cohort value so it reads "OLS-<cohort>".
project_df["cohort"] = "OLS-" + project_df["cohort"].astype(str)

Number of projects over all cohorts

# One row of `project_df` per project, so the row count is the project count.
project_df.shape[0]

Cohorts#

# Count the projects of each cohort.
# `name` is selected explicitly rather than dropping every other column by
# hand: the hand-written drop list did not mention `graduation`, which left a
# stray count column next to `Total`.
cohort_df = (
    project_df
    .groupby(by="cohort")[["name"]]
    .count()
    .rename(columns={"name": "Total"})
)

Summary statistics of the number of projects per cohort

# Descriptive statistics (count, mean, std, quartiles, ...) of projects per cohort.
cohort_df["Total"].describe()

count     8.000000
mean     29.750000
std       5.391793
min      20.000000
25%      26.750000
50%      30.500000
75%      33.250000
max      37.000000
Name: Total, dtype: float64

Median number of projects per cohort

# Median of the per-cohort project counts.
cohort_df["Total"].median()

np.float64(30.5)

# Bar chart of the number of projects in each cohort.
fig, ax = plt.subplots()
fig.set_dpi(300)
cohort_df["Total"].plot(kind="bar", ax=ax, color="#139D3D")
ax.set_ylabel("Number of projects")
# The cohort names on the ticks are self-explanatory, so the x label is blank.
ax.set_xlabel("")

Text(0.5, 0, '')

../_images/6a149dd66749a6e85202391e124fa4807a5a21900ebb980a619f6df96663a5b9.png

Participants#

Summary statistics of the number of participants per project

# Descriptive statistics of the number of participants listed per project.
project_df["participantNb"].describe()

count    238.000000
mean       1.756303
std        1.346766
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        8.000000
Name: participantNb, dtype: float64

Distribution of the number of participants per project

# Histogram of the number of participants per project.
# Participant counts are integers, so the bin edges are placed at k - 0.5 and
# k + 0.5 for every count k between the observed minimum and maximum. Each bar
# then covers exactly one count and is centred on it. A fixed `bins=8` only
# gives one bin per count while the largest project has exactly 8 participants.
participant_counts = project_df.participantNb
bin_edges = [
    count - 0.5
    for count in range(
        int(participant_counts.min()), int(participant_counts.max()) + 2
    )
]

fig, ax = plt.subplots()
fig.set_dpi(300)
participant_counts.plot.hist(
    bins=bin_edges, ax=ax, legend=False, color="#139D3D"
)
plt.xlabel('Number of participants per projects')

Text(0.5, 0, 'Number of participants per projects')

../_images/cafd23bcb70585df4c4e7ba70a026aa4898a0d86f81aac26b7cc6eb468722fb6.png

# Mean number of participants per project for each cohort.
# Only `participantNb` is averaged, so it is selected directly instead of
# dropping every other column by name. The drop list repeated "status" and
# would make `.mean()` fail as soon as the table gained another non-numeric
# column.
cohort_df = project_df.groupby(by="cohort")[["participantNb"]].mean()

fig, ax = plt.subplots()
fig.set_dpi(300)
cohort_df["participantNb"].plot.bar(ax=ax, color="#139D3D")
ax.set_ylabel("Mean number of participants per project")
# Cohort names are shown on the ticks; no axis label is needed.
ax.set_xlabel("")

Text(0.5, 0, '')

../_images/8455539db976f9ff2e883e12c397d5c63634f53b49d45bf2211da2982905e34f.png

# Mean number of participants per project for each cohort.
# `participantNb` is selected explicitly: the hand-written drop list listed
# "status" twice and breaks `.mean()` whenever a new non-numeric column
# appears in the table.
cohort_df = project_df.groupby(by="cohort")[["participantNb"]].mean()

fig, ax = plt.subplots()
fig.set_dpi(300)
cohort_df["participantNb"].plot.bar(ax=ax, color="#139D3D")
ax.set_ylabel("Mean number of participants per project")
# Cohort names are shown on the ticks; no axis label is needed.
ax.set_xlabel("")

Text(0.5, 0, '')

Keywords#

# Maps a capitalised keyword variant to the canonical keyword it is counted as.
keyword_aliases = {
    "Community building": "Community",
    "Research community": "Community",
    "Open community": "Community",
    "Ethics of ai": "Ethical AI",
    "Ethical ai": "Ethical AI",
    "Enviromental": "Environmental science",
    "Equal opportunity": "Equality",
    "Training": "Training and education",
    "Education": "Training and education",
    "Artificial intelligence": "AI",
    "Ai": "AI",
    "Fair": "FAIR",
    "Open-source": "Open source",
    "Open source software": "Open source",
    "Opensource": "Open source",
    "Os": "Open source",
    "Open source projects": "Open source",
    "Visualisation": "Data visualisation",
    "Next-generation sequencing": "Sequencing",
    "Open educational resource": "Open education",
    "Reproducible research": "Reproducibility",
    "Data": "Data science",
}

# Frequency of each normalised keyword over all projects, most frequent first.
keyword_df = (
    project_df[["name", "keywords"]]
    # One row per (project, keyword) pair.
    .explode("keywords")
    .assign(
        # Normalise only the keyword column, so project names are never
        # rewritten by the alias table. Stripping surrounding whitespace
        # before capitalising makes a separate " data science" alias
        # unnecessary.
        keywords=lambda df: (
            df.keywords.str.strip().str.capitalize().replace(keyword_aliases)
        )
    )
    # Projects without keywords yield an empty string after the split; without
    # this filter the empty string is counted as the most frequent "keyword".
    .query('keywords != ""')
    .groupby(by="keywords")
    .count()
    .rename(columns={"name": "Frequency"})
    .sort_values("Frequency", ascending=False)
)

# Ten most frequent keywords (the table is already sorted by frequency).
keyword_df.iloc[:10]

	Frequency
keywords
	63
Community	36
Training and education	20
Open science	19
Open source	18
Reproducibility	13
Data science	12
Machine learning	10
Bioinformatics	9
AI	9

# Keyword -> frequency mapping used to size the words of the cloud.
frec = keyword_df["Frequency"].to_dict()

# Fixed random_state so that the layout is reproducible between runs.
wc = WordCloud(
    width=600,
    height=400,
    random_state=42,
    background_color="rgba(255, 255, 255, 0)",
)
wordcloud = wc.generate_from_frequencies(frec)

# Render the cloud as an image without axes or padding.
fig, ax = plt.subplots(figsize=(13, 5))
ax.imshow(wc)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

../_images/60ab90fe5cf4424f450ef57b4add60ee7f46825489048455f5183c9d3ecaaa87.png

Graduated / non graduated projects#

Percentage of graduated projects

# Share of all projects whose status is "graduated", as a percentage.
graduated_projects = project_df.query('status == "graduated"')
100 * len(graduated_projects) / len(project_df)

72.26890756302521

# Percentage of graduated projects within collaborations

  Cell In[16], line 1
    Percentage of graduated project within collaboration
               ^
SyntaxError: invalid syntax

# Restrict to the projects that belong to a collaboration, then compute the
# percentage of them whose status is "graduated".
collaboration_projects = project_df.query('collaboration != ""')
graduated_collaborations = collaboration_projects.query('status == "graduated"')
100 * len(graduated_collaborations) / len(collaboration_projects)

94.11764705882354

Projects that did not graduate#

# Projects whose status is not "graduated", reduced to their identifying
# columns (the listed descriptive/derived columns are removed).
non_graduated_project = project_df.query('status != "graduated"').drop(
    columns=[
        "description",
        "keywords",
        "status",
        "graduation",
        "collaboration",
        "participantNb",
    ]
)

# Turn the list-valued people columns back into ", "-separated text for export.
for people_column in ("participants", "mentors"):
    non_graduated_project[people_column] = non_graduated_project[people_column].map(
        lambda people: ", ".join(map(str, people))
    )

# Tab-separated export, without the index column.
non_graduated_project.to_csv(
    "../results/openseeds/non_graduated_project.csv", sep="\t", index=False
)

Projects that came back for another cohort (the same project name appears more than once)

# Every row whose project name occurs more than once (keep=False flags all
# occurrences, not only the repeats).
has_repeated_name = project_df["name"].duplicated(keep=False)
project_df.loc[has_repeated_name]

	name	participants	mentors	description	cohort	keywords	status	graduation	participantNb
105	Bioinformatics Secondary school Outreach in Ni...	[Emmanuel Adamolekun]	[Meag Doherty]	Bioinformatics Secondary School Outreach (BSSO...	OLS-4	[outreach, secondary school outreach, training...			1
111	Hub23: An open source community and infrastruc...	[Lydia France, Luke Hare, Callum Mole]	[Renato Alves]	Binderhub is a service that allows users to sh...	OLS-4	[research community, technical development]			3
137	Bioinformatics Secondary school Outreach in Ni...	[Emmanuel Adamolekun]	[Meag Doherty]	Bioinformatics Secondary School Outreach (BSSO...	OLS-5	[Bioinformatics, Students, data analysis]			1
141	Hub23: An open source community and infrastruc...	[Callum Mole, Lydia France, Luke Hare]	[Renato Alves]	Binderhub is a service that allows users to sh...	OLS-5	[Open Source, Reproducibility, Community, Open...	graduated		3
171	An extensible notebook for open specimens	[Nicky Nicolson]	[Andrea Sánchez Tapia, Batool Almarzouq]	This project is developing a prototype “extens...	OLS-6	[biodiversity informatics, species description...			1
175	Bioinformatics Secondary school Outreach in Ni...	[Emmanuel Adamolekun]	[Michael Landi]	Bioinformatics Secondary School Outreach (BSSO...	OLS-6	[Bioinformatics, Students, data analysis]			1
208	An extensible notebook for open specimens	[Nicky Nicolson]	[Andrea Sánchez Tapia, Batool Almarzouq]	This project is developing a prototype “extens...	OLS-7	[biodiversity informatics, species description...	graduated	https://www.youtube.com/live/qcgrHXo1hGY	1