I am having trouble scraping certain data from PDF files in Python. There are no console errors, but when the CSV is produced, the columns from Owner's First Name through Zip Code are filled with either the wrong data or no data at all. These six columns should be filled with data taken from the left side of the PDF, found under TAX MAP PARCEL NUMBER. Other than that, everything else is fine.
Snip of my current CSV, where I have manually entered some of the missing data:
Here is the PDF Link: .pdf
import re
import csv
import pdfplumber
# Define the file path
root_path = "C:\\Users\\jfdal\\OneDrive\\Desktop\\2022"
file_name = "austerlitz_2022_fr.pdf"
file_path = f"{root_path}\\{file_name}"

# Regex patterns applied to the text of one property section.
# Each pattern captures the value of interest in group 1.
# Retain your property address pattern
property_address_pattern = r"\n([A-Za-z0-9\s]+(?:\d{1,4}\s[A-Za-z0-9\s]+)?)\n"
# The integer part is unbounded (\d+ rather than \d{1,3}): the script keeps
# parcels of 100+ acres, and a three-digit cap made any parcel of 1000 acres
# or more fail to match, read as 0 acres, and get filtered out.
acreage_pattern = r"ACRES\s+(\d+\.\d{1,2})"
# The amount patterns below accept digits and thousands separators.
value_pattern = r"FULL MARKET VALUE\s+([\d,]+)"
tax_pattern = r"COUNTY TAXABLE VALUE\s+([\d,]+)"
ag_tax_pattern = r"AG\s+DISTRIC\s+41720\s+([\d,]+)"
forest_tax_pattern = r"FOREST\s+LND\s+47460\s+([\d,]+)"
solar_tax_pattern = r"RPTL 487\s+([\d,]+)"

# List to store extracted data (one dict per property that passes the filter)
results = []
# Read the entire PDF
with pdfplumber.open(file_path) as pdf:
    # Join pages with a newline so the last line of one page is not glued to
    # the first line of the next. `or ""` guards against a page for which
    # extract_text() yields None (reported for text-less pages in some
    # pdfplumber versions -- TODO confirm for the installed version).
    content = "\n".join(page.extract_text() or "" for page in pdf.pages)  # Process all pages

# Remove page headers before splitting into property sections.
# re.S lets ".*?" cross line breaks, so everything from the
# "TAX MAP PARCEL NUMBER ..." heading through the end of the line that
# contains "ACCOUNT NO." is dropped.
content = re.sub(
    r"TAX MAP PARCEL NUMBER PROPERTY LOCATION & CLASS.*?ACCOUNT NO\..*?\n",
    "",
    content,
    flags=re.S,
)

# Split content by property sections (a run of 50 or more asterisks separates them)
properties = re.split(r"\*{50,}", content)
def parse_numeric(value_match):
    """Return group 1 of *value_match* with commas removed, or "0" if there is no match."""
    return value_match.group(1).replace(",", "") if value_match else "0"


def _parse_owner_fields(lines):
    """Collect the owner name and mailing-address columns from a record's lines.

    Args:
        lines: the stripped, non-empty text lines of one property section.

    Returns:
        A dict keyed by the six owner CSV columns. Any field that cannot be
        found keeps the "0" placeholder.
    """
    owner = {
        "Owner First Name": "0",
        "Owner Last Name": "0",
        "Street Address": "0",
        "Town": "0",
        "State": "0",
        "Zip Code": "0",
    }
    # NOTE(review): "CURRENT OWNERS NAME" / "CURRENT OWNERS ADDRESS" look like
    # the column headings of the page header that the script strips before
    # splitting into records. If so, no record line ever contains them and all
    # six owner fields stay "0"; the owner data would then have to be read
    # from the record's left-hand column instead -- confirm against the
    # extracted text of the PDF.
    for line in lines:
        if len(line.split()) <= 3:
            # Too short to hold a label plus a value.
            continue
        if "CURRENT OWNERS NAME" in line:
            name_parts = line.replace("CURRENT OWNERS NAME", "").strip().split()
            owner["Owner First Name"] = name_parts[0] if name_parts else "0"
            owner["Owner Last Name"] = name_parts[-1] if len(name_parts) > 1 else "0"
        elif "CURRENT OWNERS ADDRESS" in line:
            address_parts = line.replace("CURRENT OWNERS ADDRESS", "").strip().split(", ")
            if len(address_parts) != 3:
                continue
            owner["Street Address"] = address_parts[0]
            owner["Town"] = address_parts[1]
            # The third part is expected to be "<state> <zip>". Indexing it
            # blindly raised IndexError when the ZIP was missing, which made
            # the caller discard the whole record.
            state_zip = address_parts[2].split()
            if state_zip:
                owner["State"] = state_zip[0]
            if len(state_zip) > 1:
                owner["Zip Code"] = state_zip[1]
    return owner


for prop in properties:
    try:
        # Get acreage and filter properties with less than 100 acres first,
        # so no further parsing is done for records that will be dropped.
        acreage_match = re.search(acreage_pattern, prop)
        acreage = float(acreage_match.group(1)) if acreage_match else 0
        if acreage < 100:
            continue

        lines = [line.strip() for line in prop.split("\n") if line.strip()]

        # Extract Prop ID from the first line if structured like "123.-1-45"
        prop_id_match = (
            re.match(r"(\d{1,3}\.\-?\d{1,3}\-?\d{1,3}\.?\d{0,3})", lines[0])
            if lines
            else None
        )
        prop_id = prop_id_match.group(1) if prop_id_match else "0"

        property_address_match = re.search(property_address_pattern, prop)

        # Append extracted results; missing values are written as "0".
        results.append({
            "Property Town": "Austerlitz",
            "Prop ID": prop_id,
            "Property Address": property_address_match.group(1).strip() if property_address_match else "0",
            "Acreage": f"{acreage:.2f}",
            **_parse_owner_fields(lines),
            "Full Market Value": parse_numeric(re.search(value_pattern, prop)),
            "Tax Value": parse_numeric(re.search(tax_pattern, prop)),
            "AG Tax": parse_numeric(re.search(ag_tax_pattern, prop)),
            "Forest Tax": parse_numeric(re.search(forest_tax_pattern, prop)),
            "Solar Tax": parse_numeric(re.search(solar_tax_pattern, prop)),
        })
    except Exception as e:
        # Per-record safety net: report the problem and move on to the next
        # property instead of aborting the whole run.
        print(f"Error parsing property: {e}")
# Write results to CSV (only properties with 100+ acres)
output_file = f"{root_path}\\austerlitz_results.csv"

# Column order of the CSV; the keys match the dicts collected in `results`.
fieldnames = [
    "Property Town",
    "Prop ID",
    "Property Address",
    "Acreage",
    "Owner First Name",
    "Owner Last Name",
    "Street Address",
    "Town",
    "State",
    "Zip Code",
    "Full Market Value",
    "Tax Value",
    "AG Tax",
    "Forest Tax",
    "Solar Tax",
]

# newline="" hands line-ending control to the csv module.
with open(output_file, mode="w", newline="") as out_handle:
    csv_writer = csv.DictWriter(out_handle, fieldnames=fieldnames)
    csv_writer.writeheader()
    for record in results:
        csv_writer.writerow(record)

print(f"Data extraction complete. Results saved to {output_file}")
I expect the code to produce something similar to my manually typed data. The rows with zeroes in the columns from Owner's First Name through Zip Code need to be filled with the correct data.