Xtract Tables from any website

Prerequisites

```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
```
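These imports come from the requests, pandas, and beautifulsoup4 packages. The script below also uses tqdm, and pandas needs openpyxl to write .xlsx files. Assuming pip is available (in Termux it comes with the python package), one way to install everything is:

```
pip install requests pandas beautifulsoup4 tqdm openpyxl
```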

Installation

Enter:

```
nano xtract.py
```

Paste:

```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import os
from tqdm import tqdm
import sys


def clean_sheet_name(name):
    """Clean invalid characters from Excel sheet names"""
    return re.sub(r'[\\/*?:\[\]]', '', name)[:30]


def manual_table_extraction(table):
    """Manually extract table data from BeautifulSoup table object"""
    # Extract headers
    headers = []
    thead = table.find('thead')
    if thead:
        for th in thead.find_all('th'):
            headers.append(th.get_text(strip=True))

    # Extract rows
    rows = []
    tbody = table.find('tbody')
    if tbody:
        for row in tbody.find_all('tr'):
            cols = row.find_all('td')
            if cols:  # Skip empty rows
                row_data = [col.get_text(strip=True) for col in cols]
                rows.append(row_data)

    # Create DataFrame
    if headers and rows:
        return pd.DataFrame(rows, columns=headers)
    elif rows:
        return pd.DataFrame(rows)
    else:
        return None


def extract_tables_robust(url, table_class=None):
    """Reliable table extraction that works for complex sites"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        if table_class:
            tables = soup.find_all('table', class_=table_class)
        else:
            tables = soup.find_all('table')
        return tables
    except Exception as e:
        print(f"⚠️ Error fetching {url}: {str(e)}")
        return []


def main():
    print("🔍 Robust Web Table Extractor")
    print("=" * 45)

    # Get user inputs
    base_url = input("\nEnter website URL pattern (use {} for page number):\n> ")
    num_pages = int(input("\nEnter number of pages to scrape:\n> ") or 1)
    filename = input("\nEnter output filename (e.g., data.csv or data.xlsx):\n> ")
    table_class = input("\nEnter table class (optional, press Enter to extract all tables):\n> ").strip() or None

    if not table_class:
        print("\n⚠️ Warning: Extracting ALL tables. This may result in messy output!")
        proceed = input("Continue? (y/n): ").lower()
        if proceed != 'y':
            print("Operation cancelled.")
            return

    all_data = []
    page_range = range(1, num_pages + 1) if "{}" in base_url else [1]

    print(f"\n⏳ Scraping {len(page_range)} pages...")

    # Scrape pages with robust error handling
    for page in tqdm(page_range):
        try:
            url = base_url.format(page) if "{}" in base_url else base_url
            tables = extract_tables_robust(url, table_class)

            if not tables:
                print(f" ⚠️ No tables found on page {page}")
                continue

            for i, table in enumerate(tables):
                try:
                    # Use manual extraction instead of pd.read_html()
                    df = manual_table_extraction(table)
                    if df is None or df.empty:
                        print(f" ⚠️ Table {i+1} on page {page} is empty")
                        continue

                    # Add metadata
                    df['Source_URL'] = url
                    df['Table_Index'] = i + 1
                    if len(page_range) > 1:
                        df['Page_Number'] = page

                    all_data.append(df)
                    print(f" ✅ Processed table {i+1} ({len(df)} rows)")
                except Exception as e:
                    print(f" ⚠️ Error processing table {i+1}: {str(e)}")
        except Exception as e:
            print(f"🚨 Page {page} failed: {str(e)}")

        time.sleep(1.5)  # Increased delay

    # Process and save results
    if not all_data:
        print("\n❌ No tables extracted. Possible issues:")
        print(" - Website requires JavaScript")
        print(" - Table class incorrect")
        print(" - Website blocking scrapers")
        return

    print(f"\n✅ Extracted {len(all_data)} tables with {sum(len(df) for df in all_data)} total rows")

    # Save results
    if filename.endswith('.csv'):
        combined = pd.concat(all_data, ignore_index=True)
        combined.to_csv(filename, index=False)
        print(f"💾 Saved CSV: {os.path.abspath(filename)}")
    elif filename.endswith('.xlsx'):
        with pd.ExcelWriter(filename) as writer:
            if table_class:
                # Combine tables when class is specified
                combined = pd.concat(all_data, ignore_index=True)
                combined.to_excel(writer, sheet_name="Combined", index=False)
                print(f"💾 Saved Excel with combined sheet: {os.path.abspath(filename)}")
            else:
                # Save each table separately
                for idx, df in enumerate(all_data):
                    page_num = df['Page_Number'].iloc[0] if 'Page_Number' in df.columns else 1
                    sheet_name = f"Page{page_num}_Table{df['Table_Index'].iloc[0]}"
                    sheet_name = clean_sheet_name(sheet_name)
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                print(f"💾 Saved Excel with {len(all_data)} sheets: {os.path.abspath(filename)}")
    else:
        print("❌ Unsupported format. Use .csv or .xlsx")


if __name__ == "__main__":
    main()
```
Press Ctrl + X, then Y, then Enter to save and exit.

Usage

```
python xtract.py
```

There are two kinds of sites: single spread sites, where the whole table sits on one page, and multi spread sites, where the table is split across several numbered pages.

Single spread:
https://en.m.wikipedia.org/wiki/Mobile_telephone_numbering_in_India

Multi spread:
https://www.kotaksecurities.com/margin-requirement/margin-trading/1
https://www.kotaksecurities.com/margin-requirement/margin-trading/2
https://www.kotaksecurities.com/margin-requirement/margin-trading/3


For single spread sites, just enter the site URL as-is, like this: https://en.m.wikipedia.org/wiki/Mobile_telephone_numbering_in_India

For multi spread sites, replace the page number at the end of the URL with {}, like this: https://www.kotaksecurities.com/margin-requirement/margin-trading/{}
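The script fills in the page number using Python's str.format, so the {} placeholder expands into one URL per page. A quick illustration of how the pattern above expands (the page count of 3 is just for this example):

```python
base_url = "https://www.kotaksecurities.com/margin-requirement/margin-trading/{}"

# Same substitution the script performs: base_url.format(page)
for page in range(1, 4):
    print(base_url.format(page))

# https://www.kotaksecurities.com/margin-requirement/margin-trading/1
# https://www.kotaksecurities.com/margin-requirement/margin-trading/2
# https://www.kotaksecurities.com/margin-requirement/margin-trading/3
```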

Specify the number of pages, set the output filename (.csv or .xlsx), and the script writes the CSV or Excel file to the current directory.
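The script also prompts for an optional table class. If you are not sure what to enter, one way to list the classes of every table on a page is a small sketch like this (it reuses the same requests/BeautifulSoup approach as xtract.py; the URL is just the single spread example from above):

```python
import requests
from bs4 import BeautifulSoup

# Illustrative URL; replace with the site you want to scrape
url = "https://en.m.wikipedia.org/wiki/Mobile_telephone_numbering_in_India"
soup = BeautifulSoup(requests.get(url, timeout=15).content, "html.parser")

# Print the class attribute of every <table> so you can pick one for the prompt
for i, table in enumerate(soup.find_all("table"), start=1):
    print(i, table.get("class"))
```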

Use `termux-share filename` to view or share the file.