Scraping data from a list of URLs with the REST API format


For some context, I have a list of gene names that have been converted to UniProt's REST API URL query format. I have written a Python script that can request and extract the first result of each query, but now I am trying to increase the speed of the program by requesting multiple URLs at once. Is this possible? (I already tried the concurrent.futures module, but it doesn't seem to improve the speed that much, since, from what I understand, the GIL means it isn't "true multithreading".) For what I mean by requesting multiple URLs at once, see the rough asyncio sketch after my code.

Code here:

#Packages
import pandas as pd
import requests as req
import io
import warnings
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from tqdm import tqdm
import time
import datetime

#Warning silencer
warnings.filterwarnings("ignore", message="Workbook contains no default style")

#Grabbing the raw file

file_path = 'FindAllMarkers.xlsx'

raw_df = pd.read_excel(file_path)

# Display the top 10 rows of raw_df
print(raw_df.head(10))

#Split into a dictionary containing each cluster
print(f"Number of clusters: {raw_df['cluster'].nunique()} (including 0)")

clusterdfs_dict = {}
# Iterate over unique cluster numbers
for cluster_num in raw_df['cluster'].unique():
    # Create a DataFrame for each cluster number
    clusterdfs_dict[cluster_num] = raw_df[raw_df['cluster'] == cluster_num]

#Creating a dataframe with all the unique genes and other relevant info
unique_genes = raw_df['gene'].unique()
gene_info = pd.DataFrame()
skipped_genes = []  # links skipped because the request failed or returned no rows

#Build the UniProt REST search-query URL for a gene (taxonomy_id 9606 = human)
def get_UniPQuery_link(gene_name, tax_id='9606', file_format='xlsx', max_retries=3):
    url = f"https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cft_intramem%2Ccc_subcellular_location%2Cft_topo_dom%2Cft_transmem%2Ccc_function%2Clit_doi_id%2Clit_pubmed_id&format={file_format}&size=500&query=%28%28gene%3A{gene_name}%29+AND+%28taxonomy_id%3A{tax_id}%29%29"
    return url
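# e.g. get_UniPQuery_link("TP53") yields the stream-endpoint URL above with
# query=%28%28gene%3ATP53%29+AND+%28taxonomy_id%3A9606%29%29, i.e. ((gene:TP53) AND (taxonomy_id:9606))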


#Generate a UniProt REST link for every unique gene
UP_links = [get_UniPQuery_link(gene) for gene in unique_genes]


start_time = time.time()
start_date_time = datetime.datetime.now()

final_dataframe = pd.DataFrame()

def process_link(link):
    try:
        # A timeout keeps a stalled request from hanging a worker thread forever
        response = req.get(link, timeout=60)
    except req.RequestException as e:
        skipped_genes.append(link)
        print(f"Request error for link: {link} ({e}). Skipping.")
        return None

    if response.status_code == 200:
        excel_data = io.BytesIO(response.content)
        raw_file_df = pd.read_excel(excel_data)

        if raw_file_df.empty:
            skipped_genes.append(link)
            print(f"Empty Excel file for link: {link}. Skipping.")
        else:
            # Keep only the first (top-ranked) row of the query result
            return raw_file_df.iloc[[0]]
    else:
        skipped_genes.append(link)
        print(f"Failed to retrieve Excel file for link: {link}. Skipping.")
    return None

def process_links_subset(links_subset, progress):
    results = []
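    # Note: ThreadPoolExecutor() defaults to min(32, os.cpu_count() + 4) worker threads
    # on Python 3.8+; an explicit max_workers argument raises or lowers that cap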
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_link, link): link for link in links_subset}
        for future in concurrent.futures.as_completed(futures):
            first_row = future.result()
            if first_row is not None:
                results.append(first_row)
            progress.update(1)  # Update progress bar
    
    return results

def process_all_links(links):
    results = []
    subset_size = (len(links) + 7) // 8  # Calculate size of each subset
    with tqdm(total=len(links)) as progress:
        for i in range(0, len(links), subset_size):
            subset = links[i:i+subset_size]
            subset_results = process_links_subset(subset, progress)
            results.extend(subset_results)
    
    return results

# Process the links in 8 sequential batches; the links within each batch are fetched concurrently
processed_data = process_all_links(UP_links)

# DataFrame.append was removed in pandas 2.0; concatenate the collected rows instead
if processed_data:
    final_dataframe = pd.concat(processed_data, ignore_index=True)

print("Skipped genes:", skipped_genes)
gene_info = final_dataframe.copy()

gene_info.to_excel('./gene_info.xlsx', index=False, engine='openpyxl')
# Get the end system time
end_date_time = datetime.datetime.now()
end_time = time.time()
elapsed_time = end_time - start_time
# Convert total seconds to hours, minutes, and seconds
hours = int(elapsed_time // 3600)
minutes = int((elapsed_time % 3600) // 60)
seconds = int(elapsed_time % 60)
# Display the start and end system date and time
print(f"Start Date and Time: {start_date_time}")
print(f"End Date and Time: {end_date_time}")
print(f"Execution time: {hours} hours, {minutes} minutes, {seconds} seconds")
print(f"These are the skipped genes: {skipped_genes}")
