import re
# Sample texts
texts = [
    "The meeting is on 2024-06-15 in New York.",
    "We will travel to San Francisco on June 20th, 2024.",
    "Deadline: 15/06/2024, Location: Berlin.",
    "Event date: 2024/06/15, place: London.",
]

# Date pattern matching YYYY-MM-DD, YYYY/MM/DD, DD/MM/YYYY and
# "Month DD[st|nd|rd|th][,] YYYY".
# - The numeric year-first form requires the same separator on both sides, so
#   a mixed form such as "2024-06/15" is not reported as a date.
# - Month names are spelled out (full name or common abbreviation) instead of
#   "prefix + any letters", so words like "Decided" or "Marching" do not count
#   as months.
# The single outer capturing group is kept so that findall() returns plain
# strings and group(1) is the whole date.
date_pattern = re.compile(
    r"(\b\d{4}(?:-\d{2}-\d{2}|/\d{2}/\d{2})\b"
    r"|\b\d{2}/\d{2}/\d{4}\b"
    r"|\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
    r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?"
    r"|Dec(?:ember)?) \d{1,2}(?:st|nd|rd|th)?,? \d{4}\b)",
    re.IGNORECASE,
)

# Location pattern: one or more consecutive capitalized words separated by a
# single space (simple heuristic for demo purposes, not real NER).
location_pattern = re.compile(r"\b([A-Z][a-z]+(?: [A-Z][a-z]+)*)\b")

# Month names that must never be reported as locations. Built once at module
# level instead of on every loop iteration.
months = frozenset({
    "January", "February", "March", "April", "May", "June", "July",
    "August", "September", "October", "November", "December",
})

# Capitalized words that the location heuristic would otherwise pick up
# (sentence starters, field labels, weekdays) but that are not places.
_non_location_words = frozenset({
    "The", "This", "That", "These", "Those", "There", "Here",
    "You", "They", "She", "His", "Her", "Its", "Our", "Their",
    "And", "But", "For", "When", "Where",
    "Deadline", "Location", "Event", "Date", "Place", "Venue", "Time",
    "Meeting",
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
    "Sunday",
})


def extract_dates(text):
    """Return every date string found in *text*, in order of appearance."""
    return date_pattern.findall(text)


def extract_locations(text):
    """Return the capitalized phrases in *text* that look like locations.

    Date spans are masked out first so that month names or abbreviations that
    are part of a date (e.g. "Jun 3, 2024") are never mistaken for places.
    Remaining candidates are dropped when they are month names, known
    non-location words, or two characters or shorter.
    """
    # A newline is used as the mask because the location pattern only joins
    # words across a single literal space; the mask therefore can never glue
    # the words on either side of a date into one phrase.
    masked = date_pattern.sub("\n", text)
    return [
        candidate
        for candidate in location_pattern.findall(masked)
        if len(candidate) > 2
        and candidate not in months
        and candidate not in _non_location_words
    ]


def extract_info(text):
    """Return a dict with the original text, its dates and its locations."""
    return {
        "text": text,
        "dates": extract_dates(text),
        "locations": extract_locations(text),
    }


extracted_info = [extract_info(text) for text in texts]

for info in extracted_info:
    print(f"Text: {info['text']}")
    print(f"Extracted Dates: {info['dates']}")
    print(f"Extracted Locations: {info['locations']}\n")