12 Kasiski Test

Author

Jacques Mock Schindler

Published

24.09.2025

Code
import string
import pdfplumber
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from collections import defaultdict
from typing import Optional
Code
def pdf_reader(path: str, start: int = 0, end: Optional[int] = None) -> str:
    """Extract text from a range of pages of a PDF file.

    Args:
        path: Path to the PDF file.
        start: Index of the first page to read (0-based, inclusive).
        end: Index one past the last page to read; ``None`` reads to the end.

    Returns:
        The concatenated page texts, one trailing newline per page.
    """
    with pdfplumber.open(path) as pdf:
        chunks = []
        # Slice instead of scanning every page and testing the index range.
        for page in pdf.pages[start:end]:
            # extract_text() returns None for pages without a text layer;
            # treat those as empty so the concatenation never crashes.
            chunks.append((page.extract_text() or "") + "\n")
    return "".join(chunks)
Code
def text_cleaning(text: str) -> str:
    """Normalize text for classical-cipher work.

    Uppercases the input, transliterates German umlauts (Ä→AE, Ö→OE, Ü→UE),
    and keeps alphabetic characters only (spaces, digits, punctuation and
    newlines are dropped).

    Args:
        text: Arbitrary input text.

    Returns:
        The normalized string (letters only, uppercase).
    """
    # str.upper() already maps 'ß' to 'SS'; the table entry is only a
    # safeguard in case the input is pre-uppercased some other way.
    table = str.maketrans({'Ä': 'AE', 'Ö': 'OE', 'Ü': 'UE', 'ß': 'SS'})
    normalized = text.upper().translate(table)
    # Single join instead of quadratic += concatenation; isalpha() also
    # removes spaces, so no separate replace(' ', '') is needed.
    return ''.join(c for c in normalized if c.isalpha())
Code
def vigenere(text: str, key: str, mode: str) -> str:
    """Encrypt or decrypt uppercase A-Z text with the Vigenère cipher.

    Args:
        text: The message, expected to consist of characters 'A'-'Z'.
        key: The repeating key (non-empty).
        mode: Either 'encrypt' or 'decrypt'.

    Returns:
        The transformed text, same length as the input.

    Raises:
        ValueError: If ``mode`` is neither 'encrypt' nor 'decrypt'.
    """
    if mode not in ['encrypt', 'decrypt']:
        raise ValueError("Mode must be 'encrypt' or 'decrypt'")
    # The two branches of the original differed only in the sign of the
    # key shift, so fold them into one loop.
    sign = 1 if mode == 'encrypt' else -1
    key_length = len(key)
    base = ord('A')
    result = []
    for i, char in enumerate(text):
        shift = ord(key[i % key_length]) - base
        result.append(chr((ord(char) - base + sign * shift) % 26 + base))
    return ''.join(result)
Code
def _get_factors(n):
    factors = set()
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            factors.add(i)
            factors.add(n // i)
    if n > 1:
        factors.add(n)
    return list(factors)
Code
def kasiski_test(ciphertext, min_seq_len=3, max_seq_len=5):
    """Estimate likely Vigenère key lengths via the Kasiski examination.

    Records every position of every substring whose length lies in
    [min_seq_len, max_seq_len], measures the gaps between consecutive
    repeats, and tallies the factors of those gaps. Frequent factors are
    candidate key lengths.

    Args:
        ciphertext: The encrypted text to analyze.
        min_seq_len: Smallest substring length to probe (default 3).
        max_seq_len: Largest substring length to probe (default 5).

    Returns:
        dict: factor -> occurrence count, ordered by count descending.
    """
    # Every occurrence position of each probed substring.
    positions_by_sequence = defaultdict(list)
    for length in range(min_seq_len, max_seq_len + 1):
        for start in range(len(ciphertext) - length + 1):
            positions_by_sequence[ciphertext[start:start + length]].append(start)

    # Gaps between consecutive repeats of the same substring.
    gaps = [
        later - earlier
        for positions in positions_by_sequence.values()
        if len(positions) > 1
        for earlier, later in zip(positions, positions[1:])
    ]

    # Tally the factors of every gap.
    factor_counts = defaultdict(int)
    for gap in gaps:
        for factor in _get_factors(gap):
            factor_counts[factor] += 1

    # Most frequent factors (most likely key lengths) come first.
    return dict(sorted(factor_counts.items(), key=lambda kv: kv[1], reverse=True))
Code
# Read and clean the PDF text (truncate to avoid huge outputs)
economist_plain = pdf_reader('economist.pdf')
# Strip a PDF bullet glyph and carriage returns before normalization.
economist_plain = economist_plain.replace('■', ' ').replace('\r', ' ')
# Collapse runs of whitespace into single spaces.
economist_plain = ' '.join(economist_plain.split())
# Reduce to uppercase A-Z and cap at 20,000 characters.
economist_plain = text_cleaning(economist_plain)[:20000]
# show only the first 10 characters to avoid large verbatim output
print(economist_plain[:10])
BUSINESSHB
Code
def kasiski_test(ciphertext, min_seq_len=3, max_seq_len=5):
    """
    Performs the Kasiski test on a given Vigenère-encrypted text.

    The Kasiski test is a cryptanalytic method used to determine the likely
    key length of a Vigenère cipher. It identifies repeated character sequences,
    measures the distances between their occurrences, and analyzes the factors
    of these distances to find probable key lengths.

    The test works on the principle that when the same plaintext sequence is
    encrypted with the same portion of the repeating key, it produces the same
    ciphertext sequence. The distance between such repetitions is likely to be
    a multiple of the key length.

    Args:
        ciphertext (str): The encrypted text to analyze.
        min_seq_len (int, optional): Minimum length of character sequences to search for.
                                   Defaults to 3. Shorter sequences may produce noise.
        max_seq_len (int, optional): Maximum length of character sequences to search for.
                                   Defaults to 5. Longer sequences are less likely to repeat.

    Returns:
        dict: A dictionary where keys are factors of the repeat distances
              (candidate key lengths) and values are how often each factor
              occurred, sorted by frequency in descending order.
              Higher frequencies suggest more likely key lengths.

    Example:
        >>> kasiski_test("ABCDEFABCDEF")
        {2: 9, 3: 9, 6: 9}

        Every repeated sequence ("ABC", "BCD", ..., "ABCDE", "BCDEF")
        recurs at distance 6, so the factors 2, 3 and 6 are each counted
        once per distance; the true key length divides all of them.
    """
    # Step 1: Find positions of repeated sequences in the ciphertext
    # Use defaultdict to automatically initialize empty lists for new sequences
    sequences_positions = defaultdict(list)
    
    # Iterate through all possible sequence lengths within the specified range
    for seq_len in range(min_seq_len, max_seq_len + 1):
        # Extract all possible sequences of current length from the ciphertext
        # Stop at len(ciphertext) - seq_len + 1 to avoid index out of bounds
        for i in range(len(ciphertext) - seq_len + 1):
            # Extract sequence starting at position i with length seq_len
            sequence = ciphertext[i:i+seq_len]
            # Record the starting position of this sequence
            sequences_positions[sequence].append(i)

    # Step 2: Calculate distances between occurrences of repeated sequences
    distances = []
    
    # Process each sequence and its list of positions
    for sequence, positions in sequences_positions.items():
        # Only consider sequences that appear more than once
        if len(positions) > 1:
            # Calculate distances between consecutive occurrences
            for i in range(len(positions) - 1):
                # Distance is the difference between consecutive positions
                distance = positions[i+1] - positions[i]
                distances.append(distance)

    # Step 3: Find factors of all distances and count their frequencies
    # Factors that appear frequently are likely to be multiples of the key length
    factor_counts = defaultdict(int)
    
    # For each distance, find all its factors
    for dist in distances:
        factors = _get_factors(dist)  # Uses the helper function from earlier
        
        # Increment the count for each factor
        # Factors that appear more often across different distances
        # are more likely to represent the actual key length
        for factor in factors:
            factor_counts[factor] += 1

    # Step 4: Sort results by frequency in descending order
    # The most frequent factors are the most likely key lengths
    sorted_factor_counts = sorted(
        factor_counts.items(),        # Convert to list of (factor, count) tuples
        key=lambda item: item[1],     # Sort by count (second element of tuple)
        reverse=True                  # Sort in descending order (highest first)
    )

    # Convert back to dictionary and return
    # Most likely key lengths will appear first in the dictionary
    return dict(sorted_factor_counts)
Code
def string_2_dataframe(text: str, n: int) -> Optional[pd.DataFrame]:
    """
    Distribute the characters of ``text`` cyclically over ``n`` columns.

    Character 0 goes to column 1, character 1 to column 2, ..., character n
    back to column 1, and so on. Columns that end up shorter than the
    longest one are padded with NaN so the DataFrame stays rectangular.

    This layout is handy for per-column frequency analysis of
    polyalphabetic ciphers, where each column corresponds to one letter
    of the repeating key.

    Args:
        text (str): The string to distribute over the columns.
        n (int): Number of columns; must be greater than 0.

    Returns:
        Optional[pd.DataFrame]: DataFrame with columns "Spalte_1" ...
                              "Spalte_n", or None if n <= 0 (an error
                              message is printed in that case).

    Example:
        >>> string_2_dataframe("ABCDEFGHIJ", 3)
           Spalte_1 Spalte_2 Spalte_3
        0        A        B        C
        1        D        E        F
        2        G        H        I
        3        J      NaN      NaN
    """
    # Guard clause: a non-positive column count cannot form a DataFrame.
    if n <= 0:
        print("Fehler: Die Anzahl der Spalten (n) muss größer als 0 sein.")
        return None

    # text[offset::n] selects every n-th character, i.e. the cells that
    # belong to one column (offset, offset+n, offset+2n, ...).
    columns = [list(text[offset::n]) for offset in range(n)]

    # When len(text) is not divisible by n, trailing columns are shorter;
    # pad each one with NaN up to the height of the tallest column.
    height = max((len(column) for column in columns), default=0)
    padded = [column + [np.nan] * (height - len(column)) for column in columns]

    # Build the frame with 1-based, human-friendly column labels.
    return pd.DataFrame(
        {f"Spalte_{index + 1}": column for index, column in enumerate(padded)}
    )
Code
def berechne_distanz_zu_e(dataframe: pd.DataFrame) -> pd.Series:
    """
    Compute, per column, the forward cyclic alphabet distance from the
    column's most frequent letter to 'E'.

    Since 'E' is the most frequent letter in English and German text, this
    distance estimates the Caesar shift applied to each column of a
    Vigenère ciphertext that has been split by key position.

    The distance always runs towards 'Z' and wraps around after it:
    A→E is 4 steps, Z→E is 5 steps, F→E is 25 steps.

    Args:
        dataframe (pd.DataFrame): Frame whose cells are single letters;
                                NaN cells are ignored.

    Returns:
        pd.Series: Distance for each column, indexed by column name.
                  Columns containing only NaN are omitted. An empty input
                  yields an empty integer Series.

    Example:
        >>> df = pd.DataFrame({'Col1': ['A', 'A', 'B'], 'Col2': ['C', 'C', 'C']})
        >>> berechne_distanz_zu_e(df)
        Col1    4
        Col2    2
        dtype: int64
    """
    # Nothing to analyse in an empty frame; keep the dtype consistent.
    if dataframe.empty:
        return pd.Series(dtype=int)

    # 0-based alphabet position of the reference letter 'E' (= 4).
    ziel_position = ord('E') - ord('A')
    ergebnisse = {}

    for spalten_name in dataframe.columns:
        werte = dataframe[spalten_name].dropna()

        # A column of only NaN carries no frequency information — skip it.
        if werte.empty:
            continue

        # mode() returns ties in sorted order, so [0] is deterministic;
        # upper() makes 'a' and 'A' count as the same letter.
        spitzenreiter = str(werte.mode()[0]).upper()
        quell_position = ord(spitzenreiter) - ord('A')

        # Cyclic forward distance: (target - source) mod 26.
        # Python's % already yields a non-negative result, e.g.
        # Z→E: (4 - 25) % 26 = 5, F→E: (4 - 5) % 26 = 25.
        ergebnisse[spalten_name] = (ziel_position - quell_position) % 26

    # Column names become the index, distances the values.
    return pd.Series(ergebnisse)
Code
# Re-read and normalize the PDF text for the encryption demo below.
economist_plain = pdf_reader('economist.pdf')
# basic cleaning: remove problematic glyphs and normalize whitespace
economist_plain = economist_plain.replace('■', ' ').replace('\r', ' ')
# collapse multiple whitespace/newlines into single spaces so verbatim blocks stay reasonable
economist_plain = ' '.join(economist_plain.split())
# normalize to A-Z using the notebook's text_cleaning() and truncate to avoid enormous verbatim blocks
economist_plain = text_cleaning(economist_plain)[:20000]
# show only a short sample to avoid embedding a huge unbroken token in outputs
print('len(economist_plain)=', len(economist_plain))
print(economist_plain[:100])
len(economist_plain)= 3631
BUSINESSHBSTINGTHEPERVERSECONSEQUENCEOFAMERICASVISAFEESOFFSHORINGTOINDIAANDOTHERCOUNTRIESCOULDACCELE
Code
# Encrypt the cleaned plaintext with the five-letter key 'bueli'.
# NOTE(review): the key is lowercase while the text is A-Z; vigenere()
# still works because the shift is taken mod 26, but the effective shifts
# differ from an uppercase 'BUELI' — confirm this is intended.
economist_cipher = vigenere(economist_plain, 'bueli', 'encrypt')
# show only first 10 chars
print(economist_cipher[:10])
IUCZBLSCYP
Code
# Run the Kasiski test over sequence lengths 3..10; the most frequent
# factor should reveal the key length (5 for 'bueli').
key_lengths = kasiski_test(economist_cipher, 3, 10)
# print only first 10 characters of the dict's string representation
print(str(key_lengths)[:10])
{5: 1589, 
Code
# Split the ciphertext into 5 columns — one per key letter — matching the
# key length suggested by the Kasiski test above.
df = string_2_dataframe(economist_cipher, 5)
# print only first 10 characters of the DataFrame head's string repr to avoid large output
print(str(df.head())[:10])
  Spalte_1
Code
# Estimate each column's Caesar shift from the distance between its most
# frequent letter and 'E' (frequency analysis per key position).
shifts = berechne_distanz_zu_e(df)
# print only a short prefix of the Series repr to keep output small
print(str(shifts)[:10])
Spalte_1