dialect-map / scripts /csv_to_json.py
Kakashi75's picture
Modified and added automation
a25ac93
#!/usr/bin/env python3
"""
CSV to JSON Converter
Converts dialect CSV files from sheets_output/ to JSON format for index.html
"""
import csv
import json
import os
from pathlib import Path
from collections import defaultdict
# Paths
# BASE_DIR is two directory levels above this script file, i.e. the parent of
# the directory that contains the script.
BASE_DIR = Path(__file__).parent.parent
# Directory the converters read their input CSV files from.
SHEETS_OUTPUT_DIR = BASE_DIR / "sheets_output"
# Directory the converters write their JSON output files to.
JSON_OUTPUT_DIR = BASE_DIR / "data" / "processed"
def _parse_coordinate(raw, field, district):
    """Parse one latitude/longitude cell, falling back to 0.0 when unusable.

    Blank or missing cells silently become 0.0 (the default the converter has
    always used for a missing column). Non-numeric text also becomes 0.0, but
    a warning is printed so the bad cell can be fixed in the sheet.
    """
    text = str(raw or '').strip()
    if not text:
        return 0.0
    try:
        return float(text)
    except ValueError:
        print(f"⚠️ Invalid {field} {text!r} for district {district!r}; using 0.0")
        return 0.0


def convert_processed_dialects(csv_file=None, json_file=None):
    """Convert processed_dialects.csv to JSON format.

    Rows are grouped by their ``District`` column. Each district becomes one
    object with ``name``, ``lat``, ``lng``, ``region``, ``history`` (taken from
    the first row seen for that district) and ``words``, a list of
    ``{'t': Telugu_Word, 'm': Meaning, 's': Source}`` entries. Rows without a
    district are skipped; rows without a Telugu word add no word entry.

    Args:
        csv_file: Input CSV path. Defaults to
            ``SHEETS_OUTPUT_DIR / "processed_dialects.csv"``.
        json_file: Output JSON path. Defaults to
            ``JSON_OUTPUT_DIR / "processed_dialects.json"``. Its parent
            directory is created if needed.

    Returns:
        True when the JSON file was written, False when the CSV is missing or
        the conversion failed (a message is printed in both failure cases).
    """
    if csv_file is None:
        csv_file = SHEETS_OUTPUT_DIR / "processed_dialects.csv"
    if json_file is None:
        json_file = JSON_OUTPUT_DIR / "processed_dialects.json"
    csv_file = Path(csv_file)
    json_file = Path(json_file)

    if not csv_file.exists():
        print(f"⚠️ CSV file not found: {csv_file}")
        return False

    try:
        # Plain dicts keep insertion order, so districts are emitted in the
        # order they first appear in the CSV.
        district_map = {}

        # utf-8-sig reads files with or without a BOM; with plain utf-8 a BOM
        # would corrupt the first header name and every row would be skipped.
        # newline='' is what the csv module expects for correct quoted-newline
        # handling.
        with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills cells missing from short rows with None,
                # hence `or ''` instead of a .get() default.
                district_name = (row.get('District') or '').strip()
                if not district_name:
                    continue

                district = district_map.get(district_name)
                if district is None:
                    # District metadata comes from its first occurrence.
                    district = {
                        'name': district_name,
                        'lat': _parse_coordinate(
                            row.get('Latitude'), 'Latitude', district_name),
                        'lng': _parse_coordinate(
                            row.get('Longitude'), 'Longitude', district_name),
                        'region': row.get('Region') or '',
                        'history': row.get('History') or '',
                        'words': [],
                    }
                    district_map[district_name] = district

                telugu_word = row.get('Telugu_Word') or ''
                if telugu_word:  # Only add if Telugu word exists
                    district['words'].append({
                        't': telugu_word,
                        'm': row.get('Meaning') or '',
                        's': row.get('Source') or '',
                    })

        districts = list(district_map.values())

        # Write JSON
        json_file.parent.mkdir(parents=True, exist_ok=True)
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(districts, f, ensure_ascii=False, indent=2)

        word_total = sum(len(d['words']) for d in districts)
        print(f"✅ Converted {csv_file.name} → {json_file.name}")
        print(f" {len(districts)} districts, {word_total} words")
        return True
    except Exception as e:
        # Script-level boundary: report the failure and let the caller carry
        # on with the remaining files.
        print(f"❌ Error converting processed_dialects.csv: {e}")
        return False
def convert_digiwords_grouped(csv_file=None, json_file=None):
    """Convert digiwords_grouped.csv to JSON format.

    The output is ``{state: {district: [word, ...]}}`` for the two supported
    states, "Telangana" and "Andhra Pradesh" (both keys are always present).
    Each word is ``{'t': Telugu_Word, 'm': Meaning, 's': Source}``; ``Source``
    falls back to ``'Crowd'`` when the column or cell is missing. Rows lacking
    a state, district or Telugu word are skipped. Rows naming any other state
    are skipped too, and reported in a warning.

    Args:
        csv_file: Input CSV path. Defaults to
            ``SHEETS_OUTPUT_DIR / "digiwords_grouped.csv"``.
        json_file: Output JSON path. Defaults to
            ``JSON_OUTPUT_DIR / "digiwords_grouped.json"``. Its parent
            directory is created if needed.

    Returns:
        True when the JSON file was written, False when the CSV is missing or
        the conversion failed (a message is printed in both failure cases).
    """
    if csv_file is None:
        csv_file = SHEETS_OUTPUT_DIR / "digiwords_grouped.csv"
    if json_file is None:
        json_file = JSON_OUTPUT_DIR / "digiwords_grouped.json"
    csv_file = Path(csv_file)
    json_file = Path(json_file)

    if not csv_file.exists():
        print(f"⚠️ CSV file not found: {csv_file}")
        return False

    try:
        output = {
            "Telangana": {},
            "Andhra Pradesh": {},
        }
        unknown_states = set()
        unknown_rows = 0

        # utf-8-sig reads files with or without a BOM; with plain utf-8 a BOM
        # would corrupt the first header name ('State') and drop every row.
        with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills cells missing from short rows with None,
                # hence `or ''` instead of a .get() default.
                state = (row.get('State') or '').strip()
                district = (row.get('District') or '').strip()
                if not state or not district:
                    continue

                telugu_word = row.get('Telugu_Word') or ''
                if not telugu_word:
                    continue

                if state not in output:
                    # Surface these instead of dropping them silently: a typo
                    # or casing difference in State would otherwise lose data
                    # without a trace.
                    unknown_states.add(state)
                    unknown_rows += 1
                    continue

                source = row.get('Source', 'Crowd')
                if source is None:  # cell absent on a short row
                    source = 'Crowd'

                output[state].setdefault(district, []).append({
                    't': telugu_word,
                    'm': row.get('Meaning') or '',
                    's': source,
                })

        # Write JSON
        json_file.parent.mkdir(parents=True, exist_ok=True)
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)

        tg_count = sum(len(words) for words in output["Telangana"].values())
        ap_count = sum(len(words) for words in output["Andhra Pradesh"].values())
        print(f"✅ Converted {csv_file.name} → {json_file.name}")
        print(f" Telangana: {len(output['Telangana'])} districts, {tg_count} words")
        print(f" Andhra Pradesh: {len(output['Andhra Pradesh'])} districts, {ap_count} words")
        if unknown_rows:
            print(f"⚠️ Skipped {unknown_rows} rows with unrecognized State: {sorted(unknown_states)}")
        return True
    except Exception as e:
        # Script-level boundary: report the failure and let the caller carry
        # on with the remaining files.
        print(f"❌ Error converting digiwords_grouped.csv: {e}")
        return False
def main():
    """Convert all CSV files to JSON and print a summary.

    Runs each converter in turn; a converter that fails (returns False) does
    not stop the remaining ones. The final line reports how many succeeded.
    """
    print("🔄 Starting CSV to JSON conversion...")
    print(f"📂 Input: {SHEETS_OUTPUT_DIR}")
    print(f"📂 Output: {JSON_OUTPUT_DIR}\n")

    # Listing the converters once keeps the reported total in sync when a new
    # converter is added.
    converters = (convert_processed_dialects, convert_digiwords_grouped)
    success_count = sum(1 for convert in converters if convert())

    print(f"\n✨ Conversion complete: {success_count}/{len(converters)} files successfully converted")
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()