Prerequisites
The script needs the following Python packages: requests, pandas, beautifulsoup4 (bs4), tqdm (progress bar), and openpyxl (used by pandas for .xlsx output).
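On Termux (or any other environment) these can usually be installed with pip, for example:
pip install requests pandas beautifulsoup4 tqdm openpyxl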
Installation
Enter: nano xtract.py
Paste:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import os
from tqdm import tqdm
import sys
def clean_sheet_name(name):
    """Clean invalid characters from Excel sheet names"""
    return re.sub(r'[\\/*?:\[\]]', '', name)[:30]
def manual_table_extraction(table):
    """Manually extract table data from a BeautifulSoup table object"""
    # Extract headers
    headers = []
    thead = table.find('thead')
    if thead:
        for th in thead.find_all('th'):
            headers.append(th.get_text(strip=True))
    else:
        # No <thead>: fall back to <th> cells in the first row, if any
        first_row = table.find('tr')
        if first_row:
            headers = [th.get_text(strip=True) for th in first_row.find_all('th')]
    # Extract rows (fall back to the whole table when there is no <tbody>)
    rows = []
    tbody = table.find('tbody') or table
    for row in tbody.find_all('tr'):
        cols = row.find_all('td')
        if cols:  # Skip header/empty rows
            row_data = [col.get_text(strip=True) for col in cols]
            rows.append(row_data)
    # Create DataFrame; only use headers when they match the column count
    if headers and rows and len(headers) == len(rows[0]):
        return pd.DataFrame(rows, columns=headers)
    elif rows:
        return pd.DataFrame(rows)
    else:
        return None
def extract_tables_robust(url, table_class=None):
    """Reliable table extraction that works for complex sites"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        if table_class:
            tables = soup.find_all('table', class_=table_class)
        else:
            tables = soup.find_all('table')
        return tables
    except Exception as e:
        print(f"⚠️ Error fetching {url}: {str(e)}")
        return []
def main():
    print("🔍 Robust Web Table Extractor")
    print("=" * 45)

    # Get user inputs
    base_url = input("\nEnter website URL pattern (use {} for page number):\n> ")
    num_pages = int(input("\nEnter number of pages to scrape:\n> ") or 1)
    filename = input("\nEnter output filename (e.g., data.csv or data.xlsx):\n> ")
    table_class = input("\nEnter table class (optional, press Enter to extract all tables):\n> ").strip() or None

    if not table_class:
        print("\n⚠️ Warning: Extracting ALL tables. This may result in messy output!")
        proceed = input("Continue? (y/n): ").lower()
        if proceed != 'y':
            print("Operation cancelled.")
            return

    all_data = []
    page_range = range(1, num_pages + 1) if "{}" in base_url else [1]

    print(f"\n⏳ Scraping {len(page_range)} pages...")

    # Scrape pages with robust error handling
    for page in tqdm(page_range):
        try:
            url = base_url.format(page) if "{}" in base_url else base_url
            tables = extract_tables_robust(url, table_class)

            if not tables:
                print(f" ⚠️ No tables found on page {page}")
                continue

            for i, table in enumerate(tables):
                try:
                    # Use manual extraction instead of pd.read_html()
                    df = manual_table_extraction(table)
                    if df is None or df.empty:
                        print(f" ⚠️ Table {i+1} on page {page} is empty")
                        continue

                    # Add metadata
                    df['Source_URL'] = url
                    df['Table_Index'] = i + 1
                    if len(page_range) > 1:
                        df['Page_Number'] = page

                    all_data.append(df)
                    print(f" ✅ Processed table {i+1} ({len(df)} rows)")
                except Exception as e:
                    print(f" ⚠️ Error processing table {i+1}: {str(e)}")
        except Exception as e:
            print(f"🚨 Page {page} failed: {str(e)}")

        time.sleep(1.5)  # Polite delay between pages

    # Process and save results
    if not all_data:
        print("\n❌ No tables extracted. Possible issues:")
        print(" - Website requires JavaScript")
        print(" - Table class incorrect")
        print(" - Website blocking scrapers")
        return

    print(f"\n✅ Extracted {len(all_data)} tables with {sum(len(df) for df in all_data)} total rows")

    # Save results
    if filename.endswith('.csv'):
        combined = pd.concat(all_data, ignore_index=True)
        combined.to_csv(filename, index=False)
        print(f"💾 Saved CSV: {os.path.abspath(filename)}")
    elif filename.endswith('.xlsx'):
        with pd.ExcelWriter(filename) as writer:
            if table_class:
                # Combine tables when a class is specified
                combined = pd.concat(all_data, ignore_index=True)
                combined.to_excel(writer, sheet_name="Combined", index=False)
                print(f"💾 Saved Excel with combined sheet: {os.path.abspath(filename)}")
            else:
                # Save each table separately
                for idx, df in enumerate(all_data):
                    page_num = df['Page_Number'].iloc[0] if 'Page_Number' in df.columns else 1
                    sheet_name = f"Page{page_num}_Table{df['Table_Index'].iloc[0]}"
                    sheet_name = clean_sheet_name(sheet_name)
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
                print(f"💾 Saved Excel with {len(all_data)} sheets: {os.path.abspath(filename)}")
    else:
        print("❌ Unsupported format. Use .csv or .xlsx")


if __name__ == "__main__":
    main()
Press: Ctrl + X, then Y, then Enter to save the file.
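To sanity-check the extraction logic before scraping a real site, you can run something like this in a Python shell from the same directory (the HTML snippet below is made up purely for illustration):

from bs4 import BeautifulSoup
from xtract import manual_table_extraction

html = "<table><thead><tr><th>Name</th><th>Age</th></tr></thead><tbody><tr><td>Asha</td><td>30</td></tr></tbody></table>"
table = BeautifulSoup(html, "html.parser").find("table")
print(manual_table_extraction(table))  # should print a one-row DataFrame with Name and Age columns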
Usage
python xtract.py
There are two types of sites: those where the table is spread across multiple pages (multi spread) and those where everything is on a single page (single spread).
Single spread:
https://en.m.wikipedia.org/wiki/Mobile_telephone_numbering_in_India
Multi spread:
https://www.kotaksecurities.com/margin-requirement/margin-trading/1
https://www.kotaksecurities.com/margin-requirement/margin-trading/2
https://www.kotaksecurities.com/margin-requirement/margin-trading/3
For single spread sites, just enter the site URL. Like this:
https://en.m.wikipedia.org/wiki/Mobile_telephone_numbering_in_India
For multi spread sites, replace the page number at the end of the URL with {}, like this:
https://www.kotaksecurities.com/margin-requirement/margin-trading/{}
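The script simply substitutes each page number into the {} placeholder, so page 2, for example, is fetched from:

>>> "https://www.kotaksecurities.com/margin-requirement/margin-trading/{}".format(2)
'https://www.kotaksecurities.com/margin-requirement/margin-trading/2'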
Specify the number of pages, set the output file name, and the script saves the Excel (or CSV) file, as in the sample run below.
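A typical run might look like this (the answers after each > are just an example, including the made-up file name margins.xlsx):

python xtract.py

Enter website URL pattern (use {} for page number):
> https://www.kotaksecurities.com/margin-requirement/margin-trading/{}

Enter number of pages to scrape:
> 3

Enter output filename (e.g., data.csv or data.xlsx):
> margins.xlsx

Enter table class (optional, press Enter to extract all tables):
>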
Use:
termux-share filename
to view or share the file
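For example, if the output file was named margins.xlsx:

termux-share margins.xlsx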