Merge branch 'paper' into revision-2.0

This commit is contained in:
LSaldyt
2017-12-04 12:06:12 -07:00
29 changed files with 164578 additions and 17 deletions

7
.gitignore vendored
View File

@ -31,3 +31,10 @@ copycat.log
# Output
output/*
copycat.log
papers/*.log
papers/*.pdf
papers/*.out
papers/*.aux
papers/words
*.txt

View File

@ -1,7 +1,9 @@
co.py.cat
=========
I am planning to use this codebase, or Joseph A. Hager's, to implement a variation of Copycat that uses *Entropy* instead of *Temperature*, while still preserving the parallel terraced scan in full form. If the change is viable, I plan to write a paper on that (if anyone is interested in co-authoring, let me know). For the general idea, please see pages 41 and 42 of the [*Information Sciences*](https://github.com/Alex-Linhares/FARGlexandria/blob/master/Literature/Chess-Capyblanca-2014-Linhares-Information%20Sciences.pdf) paper on [Capyblanca](https://github.com/Alex-Linhares/FARGlexandria).
![GUI](https://i.imgur.com/7pb20g0.png)
Linhares and I are planning to use this codebase to implement a variation of Copycat that uses *Entropy* instead of *Temperature*, while still preserving the parallel terraced scan in full form. If the change is viable, I plan to write a paper on that (if anyone is interested in co-authoring, let me know). For the general idea, please see pages 41 and 42 of the [*Information Sciences*](https://github.com/Alex-Linhares/FARGlexandria/blob/master/Literature/Chess-Capyblanca-2014-Linhares-Information%20Sciences.pdf) paper on [Capyblanca](https://github.com/Alex-Linhares/FARGlexandria).
**If you would like to help research and publish a paper, please let me know.**

View File

@ -22,7 +22,6 @@ def codelet(name):
return f
return wrap
# some methods common to the codelets
def __showWhichStringObjectIsFrom(structure):
if not structure:

View File

@ -3,6 +3,7 @@ from .randomness import Randomness
from .slipnet import Slipnet
from .temperature import Temperature
from .workspace import Workspace
from .gui import GUI
from pprint import pprint
@ -25,30 +26,48 @@ class Reporter(object):
class Copycat(object):
def __init__(self, rng_seed=None, reporter=None):
def __init__(self, rng_seed=None, reporter=None, gui=False):
self.coderack = Coderack(self)
self.random = Randomness(rng_seed)
self.slipnet = Slipnet()
self.temperature = Temperature() # TODO: use entropy
self.workspace = Workspace(self)
self.reporter = reporter or Reporter()
if gui:
self.gui = GUI('Copycat')
self.lastUpdate = float('-inf')
def mainLoop(self, lastUpdate):
currentTime = self.coderack.codeletsRun
self.temperature.tryUnclamp(currentTime) # TODO: use entropy
# Every 15 codelets, we update the workspace.
if currentTime >= lastUpdate + 15:
self.workspace.updateEverything()
self.coderack.updateCodelets()
self.slipnet.update(self.random)
self.temperature.update(self.workspace.getUpdatedTemperature()) # TODO: use entropy
lastUpdate = currentTime
self.reporter.report_slipnet(self.slipnet)
def step(self):
self.coderack.chooseAndRunCodelet()
self.reporter.report_coderack(self.coderack)
self.reporter.report_temperature(self.temperature)
self.reporter.report_workspace(self.workspace)
return lastUpdate
def update_workspace(self, currentTime):
self.workspace.updateEverything()
self.coderack.updateCodelets()
self.slipnet.update(self.random)
self.temperature.update(self.workspace.getUpdatedTemperature())
self.lastUpdate = currentTime
self.reporter.report_slipnet(self.slipnet)
def check_reset(self):
if self.gui.app.primary.control.go:
initial, modified, target = self.gui.app.primary.control.get_vars()
self.gui.app.reset_with_strings(initial, modified, target)
self.workspace.resetWithStrings(initial, modified, target)
return True
else:
return False
def mainLoop(self):
currentTime = self.coderack.codeletsRun
self.temperature.tryUnclamp(currentTime)
# Every 15 codelets, we update the workspace.
if currentTime >= self.lastUpdate + 15:
self.update_workspace(currentTime)
self.step()
def runTrial(self):
"""Run a trial of the copycat algorithm"""
@ -56,9 +75,8 @@ class Copycat(object):
self.slipnet.reset()
self.temperature.reset() # TODO: use entropy
self.workspace.reset()
lastUpdate = float('-inf')
while self.workspace.finalAnswer is None:
lastUpdate = self.mainLoop(lastUpdate)
self.mainLoop()
answer = {
'answer': self.workspace.finalAnswer,
'temp': self.temperature.last_unclamped_value, # TODO: use entropy
@ -67,6 +85,32 @@ class Copycat(object):
self.reporter.report_answer(answer)
return answer
def runGUI(self):
while not self.check_reset():
self.gui.update(self)
self.gui.refresh()
answers = {}
while True:
if self.check_reset():
answers = {}
self.gui.refresh()
if not self.gui.paused():
answer = self.runTrial()
self.gui.update(self)
d = answers.setdefault(answer['answer'], {
'count': 0,
'sumtemp': 0,
'sumtime': 0
})
d['count'] += 1
d['sumtemp'] += answer['temp']
d['sumtime'] += answer['time']
self.gui.add_answers(answers)
for answer, d in answers.items():
d['avgtemp'] = d.pop('sumtemp') / d['count']
d['avgtime'] = d.pop('sumtime') / d['count']
def run(self, initial, modified, target, iterations):
self.workspace.resetWithStrings(initial, modified, target)

1
copycat/gui/__init__.py Normal file
View File

@ -0,0 +1 @@
from .gui import GUI

59
copycat/gui/control.py Normal file
View File

@ -0,0 +1,59 @@
import tkinter as tk
import tkinter.ttk as ttk
from .gridframe import GridFrame
from .entry import Entry
class Control(GridFrame):
    """Playback controls for the Copycat GUI: play/pause, single-step, and go."""

    def __init__(self, parent, *args, **kwargs):
        GridFrame.__init__(self, parent, *args, **kwargs)
        # Run state: start paused, with no queued single-steps and no pending reset.
        self.paused = True
        self.steps = 0
        self.go = False
        # Bound methods serve directly as button callbacks; no lambda wrapper needed.
        self.playbutton = ttk.Button(self, text='Play', command=self.toggle)
        self.add(self.playbutton, 0, 0)
        self.stepbutton = ttk.Button(self, text='Step', command=self.step)
        self.add(self.stepbutton, 1, 0)
        self.entry = Entry(self)
        self.add(self.entry, 0, 1, xspan=2)
        self.gobutton = ttk.Button(self, text='Go', command=self.set_go)
        self.add(self.gobutton, 0, 2, xspan=2)

    def play(self):
        """Resume execution and relabel the toggle button."""
        self.paused = False
        self.playbutton['text'] = 'Pause'

    def pause(self):
        """Suspend execution and relabel the toggle button."""
        self.paused = True
        self.playbutton['text'] = 'Play'

    def toggle(self):
        """Flip between the playing and paused states."""
        (self.play if self.paused else self.pause)()

    def step(self):
        """Queue one more single-step request."""
        self.steps += 1

    def has_step(self):
        """Consume one queued step request; return whether one was available."""
        if self.steps <= 0:
            return False
        self.steps -= 1
        return True

    def set_go(self):
        """Signal that new problem strings should be loaded, then start playing."""
        self.go = True
        self.play()

    def get_vars(self):
        """Return the (initial, modified, target) strings typed by the user."""
        return self.entry.a.get(), self.entry.b.get(), self.entry.c.get()

    def reset(self):
        """Clear the pending 'go' request once it has been handled."""
        self.go = False

27
copycat/gui/entry.py Normal file
View File

@ -0,0 +1,27 @@
import tkinter as tk
import tkinter.ttk as ttk
from .gridframe import GridFrame
class Entry(GridFrame):
    """Three labeled text fields for the problem strings (initial, final, next).

    The entry widgets are exposed as attributes ``a``, ``b`` and ``c``, which
    Control.get_vars() reads.
    """

    def __init__(self, parent, *args, **kwargs):
        GridFrame.__init__(self, parent, *args, **kwargs)
        # One labeled entry per grid column: label on row 0, field on row 1.
        specs = [('aLabel', 'a', 'Initial:'),
                 ('bLabel', 'b', 'Final:'),
                 ('cLabel', 'c', 'Next:')]
        for col, (label_attr, field_attr, caption) in enumerate(specs):
            label = ttk.Label(self, text=caption)
            field = ttk.Entry(self, style='EntryStyle.TEntry')
            setattr(self, label_attr, label)
            setattr(self, field_attr, field)
            self.add(label, col, 0)
            self.add(field, col, 1)
        GridFrame.configure(self)

11
copycat/gui/gridframe.py Normal file
View File

@ -0,0 +1,11 @@
import tkinter as tk
import tkinter.ttk as ttk
class GridFrame(tk.Frame):
    """A frame whose children are laid out on a weighted Tk grid.

    Subclasses call add() to place child widgets; every used row/column gets
    weight 1 so the layout stretches with the window.
    """

    def __init__(self, parent, *args, **kwargs):
        # NOTE(review): the class inherits tk.Frame but delegates to
        # ttk.Frame.__init__, which makes the underlying widget a ttk::frame.
        # Left as-is because the GUI's current look depends on it -- confirm
        # which base class is actually intended.
        ttk.Frame.__init__(self, parent, *args, **kwargs)

    def add(self, element, x, y, xspan=1, yspan=1):
        """Place *element* at grid column *x*, row *y*, spanning xspan/yspan cells."""
        element.grid(column=x, row=y, columnspan=xspan, rowspan=yspan,
                     sticky=tk.N + tk.E + tk.S + tk.W)
        # Fix: x is the COLUMN and y is the ROW (see element.grid above).  The
        # original called rowconfigure(self, x) and columnconfigure(self, y),
        # giving weight to the wrong row/column whenever x != y.
        tk.Grid.rowconfigure(self, y, weight=1)
        tk.Grid.columnconfigure(self, x, weight=1)

91
copycat/gui/gui.py Normal file
View File

@ -0,0 +1,91 @@
import sys
import time
import tkinter as tk
import tkinter.ttk as ttk
from tkinter import scrolledtext
from tkinter import filedialog
import matplotlib.pyplot as plt
from .status import Status, StatusFrame
from .status import Plot
from .gridframe import GridFrame
from .primary import Primary
from .list import List
from .style import configure_style
from .plot import plot_imbedded
plt.style.use('dark_background')
class MainApplication(GridFrame):
    # Top-level container: the primary workspace view on top, three status
    # lists below it, and the answer-distribution plot on the right.
    def __init__(self, parent, *args, **kwargs):
        GridFrame.__init__(self, parent, *args, **kwargs)
        self.parent = parent
        self.primary = Primary(self, *args, **kwargs)
        self.add(self.primary, 0, 0, xspan=2)
        self.create_widgets()
        GridFrame.configure(self)

    def create_widgets(self):
        # Each List widget displays at most `columns` entries at a time.
        columns = 20
        self.slipList = List(self, columns)       # slipnet node activations
        self.add(self.slipList, 0, 1)
        self.codeletList = List(self, columns)    # coderack codelet urgencies
        self.add(self.codeletList, 1, 1)
        self.objectList = List(self, columns)     # workspace objects + descriptors
        self.add(self.objectList, 2, 1)
        self.graph2 = Plot(self, 'Answer Distribution')
        self.add(self.graph2, 2, 0)

    def update(self, copycat):
        # Refresh every pane from the current state of the copycat run.
        self.primary.update(copycat)
        slipnodes = copycat.slipnet.slipnodes
        codelets = copycat.coderack.codelets
        objects = copycat.workspace.objects
        self.slipList.update(slipnodes, key=lambda s: s.activation,
                             formatter=lambda s: '{}: {}'.format(s.name, round(s.activation, 2)))
        self.codeletList.update(codelets, key=lambda c: c.urgency,
                                formatter=lambda s: '{}: {}'.format(s.name, round(s.urgency, 2)))
        # Render each workspace object together with its (type=descriptor) pairs.
        get_descriptors = lambda s: ', '.join('({}={})'.format(d.descriptionType.name, d.descriptor.name) for d in s.descriptions)
        self.objectList.update(objects, formatter=lambda s: '{}: {}'.format(s, get_descriptors(s)))

    def reset_with_strings(self, initial, modified, target):
        # Forward a new problem (initial:modified :: target:?) to the primary pane.
        self.primary.reset_with_strings(initial, modified, target)
class GUI(object):
    """Top-level Tk window wrapping the MainApplication frame."""

    def __init__(self, title):
        self.root = tk.Tk()
        self.root.title(title)
        # Let the single cell holding the app expand with the window.
        tk.Grid.rowconfigure(self.root, 0, weight=1)
        tk.Grid.columnconfigure(self.root, 0, weight=1)
        self.app = MainApplication(self.root)
        self.app.grid(row=0, column=0, sticky=tk.N + tk.S + tk.E + tk.W)
        configure_style(ttk.Style())

    def add_answers(self, answers):
        """Install a plot callback rendering the current answer distribution."""
        def modifier(status):
            # Fix: dropped the leftover debug prints ("Here" and the raw
            # answers dict), which fired on every animation frame.
            with plt.style.context(('dark_background')):
                plot_imbedded(answers, status)
        self.app.graph2.status.modifier = modifier

    def refresh(self):
        """Process pending Tk events without blocking (manual event-loop pump)."""
        self.root.update_idletasks()
        self.root.update()

    def paused(self):
        """Return True while the user has playback paused."""
        return self.app.primary.control.paused

    def update(self, copycat):
        """Push the current copycat state into all GUI panes."""
        self.app.update(copycat)

26
copycat/gui/list.py Normal file
View File

@ -0,0 +1,26 @@
import tkinter as tk
import tkinter.ttk as ttk
import time
from .gridframe import GridFrame
class List(GridFrame):
    """A throttled text list showing at most `columns` (optionally sorted) items."""

    def __init__(self, parent, columns, updateInterval=.1):
        GridFrame.__init__(self, parent)
        self.text = ttk.Label(self, anchor='w', justify=tk.LEFT, width=30)
        self.add(self.text, 0, 0)
        self.columns = columns                 # max number of items displayed
        self.lastUpdated = time.time()         # timestamp of the last redraw
        self.updateInterval = updateInterval   # minimum seconds between redraws

    def update(self, l, key=None, reverse=False, formatter=lambda s: str(s)):
        """Redraw the list from *l*, at most once per `updateInterval` seconds.

        If *key* is given the items are sorted with key/reverse first; the
        leading `columns` entries are rendered with *formatter*, one per line.
        """
        current = time.time()
        if current - self.lastUpdated > self.updateInterval:
            if key is not None:
                # Fix: honor the caller's `reverse` flag (it was hard-coded to
                # False), and sort BEFORE truncating so the list shows the
                # top-ranked items rather than a sorted arbitrary prefix.
                l = sorted(l, key=key, reverse=reverse)
            l = l[:self.columns]
            self.text['text'] = '\n'.join(map(formatter, l))
            # Fix: record this redraw; lastUpdated was never refreshed, so the
            # throttle stopped throttling after the first interval elapsed.
            self.lastUpdated = current

17
copycat/gui/plot.py Normal file
View File

@ -0,0 +1,17 @@
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
def plot_imbedded(answers, status):
    """Draw a bar chart of answer counts onto ``status.subplot``.

    *answers* maps each answer string to a stats dict containing a 'count'
    entry; bars are ordered by ascending count.
    """
    ranked = sorted(answers.items(), key=lambda item: item[1]['count'])
    labels = [answer for answer, _ in ranked]
    counts = [stats['count'] for _, stats in ranked]
    positions = np.arange(len(labels))
    subplot = status.subplot
    subplot.clear()
    subplot.bar(positions, counts, align='center', alpha=0.5)
    subplot.set_xticks(positions)
    subplot.set_xticklabels(tuple(labels))
    subplot.set_ylabel('Count')
    subplot.set_title('Answers')

30
copycat/gui/primary.py Normal file
View File

@ -0,0 +1,30 @@
import tkinter as tk
import tkinter.ttk as ttk
from tkinter import scrolledtext
from tkinter import filedialog
from .control import Control
from .gridframe import GridFrame
from .workspacecanvas import WorkspaceCanvas
class Primary(GridFrame):
    # The main pane: the workspace drawing canvas above the playback controls.
    def __init__(self, parent, *args, **kwargs):
        GridFrame.__init__(self, parent, *args, **kwargs)
        self.canvas = WorkspaceCanvas(self)
        self.add(self.canvas, 0, 0, xspan=2)
        self.control = Control(self)
        self.add(self.control, 0, 2)
        GridFrame.configure(self)

    def update(self, copycat):
        # Redraw the workspace strings/answer from the current run state.
        self.canvas.update(copycat)

    def reset_with_strings(self, initial, modified, target):
        # Load a new problem into the canvas and clear the pending 'go' flag.
        self.canvas.reset_with_strings(initial, modified, target)
        self.control.reset()

66
copycat/gui/status.py Normal file
View File

@ -0,0 +1,66 @@
import matplotlib
matplotlib.use("TkAgg")
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg
from matplotlib.figure import Figure
import tkinter as tk
import tkinter.ttk as ttk
import time
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from .gridframe import GridFrame
class Plot(GridFrame):
    """An animated matplotlib plot with a save-to-file button and path entry."""

    def __init__(self, parent, title):
        GridFrame.__init__(self, parent)
        self.status = Status()
        self.sframe = StatusFrame(self, self.status, title)
        self.add(self.sframe, 0, 0, xspan=2)
        self.savebutton = ttk.Button(self, text='Save to path:', command=lambda: self.save())
        self.add(self.savebutton, 0, 1)
        self.pathentry = ttk.Entry(self, style='EntryStyle.TEntry')
        # Fix: the default path was passed as `textvariable='output/dist.png'`,
        # which Tk interprets as the NAME of a Tcl variable -- the entry showed
        # up empty.  Insert the intended default path as the entry's text.
        self.pathentry.insert(0, 'output/dist.png')
        self.add(self.pathentry, 1, 1)

    def save(self):
        """Write the current figure to the path typed in the entry, if any."""
        path = self.pathentry.get()
        if len(path) > 0:
            try:
                self.status.figure.savefig(path)
            except Exception as e:
                # Best-effort save: report the failure without killing the GUI.
                print(e)
class StatusFrame(ttk.Frame):
    # Embeds a matplotlib figure in Tk and re-renders it once per second.
    # NOTE(review): the `title` parameter is currently unused -- confirm
    # whether it was meant to label the figure.
    def __init__(self, parent, status, title):
        ttk.Frame.__init__(self, parent)
        self.status = status
        self.canvas = FigureCanvasTkAgg(status.figure, self)
        # NOTE(review): canvas.show() is deprecated in newer matplotlib in
        # favor of canvas.draw() -- verify against the pinned matplotlib version.
        self.canvas.show()
        self.canvas.get_tk_widget().pack(side=tk.BOTTOM, fill=tk.BOTH, expand=True)
        # Keep a reference to the animation so it is not garbage-collected;
        # each tick clears and redraws via the status object's modifier.
        self.animation = animation.FuncAnimation(status.figure, lambda i: status.update_plots(i), interval=1000)
class Status(object):
    """Holds a matplotlib figure plus a swappable drawing callback.

    ``modifier`` is a one-argument callable invoked with this Status object;
    callers (e.g. GUI.add_answers) replace it to change what gets drawn on
    each animation tick.
    """

    def __init__(self):
        self.figure = Figure(figsize=(5, 5), dpi=100)
        self.subplot = self.figure.add_subplot(111)
        # Data series drawn by the default modifier.
        self.x = []
        self.y = []
        self.modifier = self._default_modifier
        # Render once immediately so the canvas is never blank.
        self.update_plots(0)

    @staticmethod
    def _default_modifier(status):
        # Simple line plot of the stored series, in the dark theme.
        with plt.style.context(('dark_background')):
            status.subplot.plot(status.x, status.y)

    def update_plots(self, i):
        """Clear the axes and redraw via the current modifier (i is unused)."""
        self.subplot.clear()
        self.modifier(self)

33
copycat/gui/style.py Normal file
View File

@ -0,0 +1,33 @@
# Shared ttk colors for the dark theme: white text on a black background.
style_dict = dict(foreground='white',
                  background='black')
# State-dependent color maps for interactive widgets: keep the black/white
# theme consistent across the disabled/pressed/active/focus states.
map_options = dict(
    foreground=[('disabled', 'black'),
                ('pressed', 'white'),
                ('active', 'white')],
    background=[('disabled', 'black'),
                ('pressed', '!focus', 'black'),
                ('active', 'black')],
    highlightcolor=[('focus', 'black'),
                    ('!focus', 'black')])
def configure_style(style):
    """Apply the dark theme to the given ttk.Style instance."""
    style.configure('TButton', **style_dict)
    style.map('TButton', **map_options)
    style.configure('TLabel', **style_dict)
    #style.configure('TEntry', **style_dict)
    #style.map('TEntry', **map_options)
    # A hack to change entry style
    # NOTE(review): presumably needed because the active theme ignores these
    # options on plain TEntry (the direct configure/map above is commented
    # out).  Borrow the 'clam' theme's plain field element and rebuild the
    # entry layout around it so fieldbackground takes effect -- confirm.
    style.element_create("plain.field", "from", "clam")
    style.layout("EntryStyle.TEntry",
                 [('Entry.plain.field', {'children': [(
                     'Entry.background', {'children': [(
                         'Entry.padding', {'children': [(
                             'Entry.textarea', {'sticky': 'nswe'})],
                             'sticky': 'nswe'})], 'sticky': 'nswe'})],
                     'border': '2', 'sticky': 'nswe'})])
    style.configure("EntryStyle.TEntry",
                    background="black",
                    foreground="white",
                    fieldbackground="black")

159787
copycat/gui/sys Normal file

File diff suppressed because it is too large Load Diff

3093
copycat/gui/time Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,69 @@
import tkinter as tk
import tkinter.ttk as ttk
from .gridframe import GridFrame
font1Size = 32
font1 = ('Helvetica', font1Size)
class WorkspaceCanvas(GridFrame):
    """Black canvas rendering the four workspace strings as large glyphs."""

    def __init__(self, parent, *args, **kwargs):
        GridFrame.__init__(self, parent, *args, **kwargs)
        self.chars = []        # (char, (x, y)) pairs currently drawn
        self.initial = ''
        self.modified = ''
        self.target = ''
        self.answer = ''
        self.changed = False   # dirty flag: True when a redraw is needed
        self.canvas = tk.Canvas(self, background='black')
        self.add(self.canvas, 0, 0)
        GridFrame.configure(self)

    def update(self, copycat):
        """Redraw the canvas when the translated rule's answer changes."""
        answer = '' if copycat.workspace.rule is None else copycat.workspace.rule.buildTranslatedRule()
        if answer != self.answer:
            # Fix: store the new answer.  It was never assigned, so add_text()
            # kept drawing the stale (initially empty) answer and every update
            # with a non-empty answer looked like a fresh change.
            self.answer = answer
            self.changed = True
        if self.changed:
            self.canvas.delete('all')
            del self.chars[:]
            self.add_text()
            # Fix: clear the dirty flag once redrawn; it was never reset, so
            # the canvas was wiped and rebuilt on every single update call.
            self.changed = False

    def add_text(self):
        """Draw initial/modified on the top row and target/answer below it."""
        padding = 100

        def add_sequences(sequences, x, y):
            # Lay one row's strings out left-to-right, `padding` apart.
            for sequence in sequences:
                x += padding
                if sequence is None:
                    sequence = ''
                for char in sequence:
                    self.chars.append((char, (x, y)))
                    self.canvas.create_text(x, y, text=char, anchor=tk.NW, font=font1, fill='white')
                    x += font1Size
            return x, y

        x = 0
        y = padding
        add_sequences([self.initial, self.modified], x, y)
        x = 0
        y += padding
        add_sequences([self.target, self.answer], x, y)

    def reset_with_strings(self, initial, modified, target):
        """Load a new problem; mark dirty only if something actually changed."""
        if initial != self.initial or \
           modified != self.modified or \
           target != self.target:
            self.changed = True
        self.initial = initial
        self.modified = modified
        self.target = target

4
copycat/sampleText.txt Normal file
View File

@ -0,0 +1,4 @@
1,2
3,4
7,7
100,100

View File

@ -23,6 +23,16 @@ class Workspace(object):
self.intraStringUnhappiness = 0.0
self.interStringUnhappiness = 0.0
# LSaldyt: default initializations for GUI entry
self.targetString = ''
self.initialString = ''
self.modifiedString = ''
self.finalAnswer = None
self.changedObject = None
self.objects = []
self.structures = []
self.rule = None
def __repr__(self):
return '<Workspace trying %s:%s::%s:?>' % (
self.initialString, self.modifiedString, self.targetString,

24
gui.py Executable file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env python3
import argparse
import logging
from copycat import Copycat, Reporter
class SimpleReporter(Reporter):
    # Reporter that prints each final answer to stdout.
    def report_answer(self, answer):
        # `answer` is a dict with at least the keys 'answer', 'time' and
        # 'temp' (see the format string below).
        print('Answered %s (time %d, final temperature %.1f)' % (
            answer['answer'], answer['time'], answer['temp'],
        ))
def main():
    """Parse command-line arguments and launch the Copycat GUI."""
    # Log everything at INFO and above to a fresh file per run.
    logging.basicConfig(level=logging.INFO, format='%(message)s',
                        filename='./output/copycat.log', filemode='w')
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--seed', type=int, default=None,
                            help='Provide a deterministic seed for the RNG.')
    parsed = arg_parser.parse_args()
    program = Copycat(reporter=SimpleReporter(), rng_seed=parsed.seed, gui=True)
    program.runGUI()


if __name__ == '__main__':
    main()

336
papers/draft.tex Normal file
View File

@ -0,0 +1,336 @@
\documentclass[a4paper]{article}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Sets page size and margins
\usepackage[a4paper,top=3cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry}
%% Useful packages
\usepackage{listings}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[colorlinks=true, allcolors=blue]{hyperref}
\definecolor{lightgrey}{rgb}{0.9, 0.9, 0.9}
\lstset{ %
backgroundcolor=\color{lightgrey}}
\title{Distributed Behavior in a Fluid Analogy Architecture}
\author{Lucas Saldyt, Alexandre Linhares}
\begin{document}
\maketitle
\begin{abstract}
We investigate the distributed nature of computation in a FARG architecture, Copycat.
One of the foundations of those models is the \emph{Parallel Terraced Scan}--a psychologically-plausible model that enables a system to fluidly move between different modes of processing.
Previous work has modeled decision-making under Parallel Terraced Scan by using a central variable of \emph{Temperature}.
However, it is unlikely that this design decision accurately replicates the processes in the human brain.
This paper proposes several changes to copycat architectures that will increase their modeling accuracy.
\end{abstract}
\section{Introduction}
This paper stems from Mitchell's (1993) and Hofstadter's \& FARG's (1995) work on the copycat program.
This project focuses on effectively simulating intelligent processes through increasingly distributed decision-making.
In the process of evaluating the distributed nature of copycat, this paper also proposes a "Normal Science" framework.
First, copycat uses a "Parallel Terraced Scan" as a humanistic inspired search algorithm.
The Parallel Terraced Scan corresponds to the psychologically-plausible behavior of briefly browsing, say, a book, and delving deeper whenever something sparks one's interest.
In a way, it is a mix between a depth-first and breadth-first search.
This type of behavior seems to very fluidly change the intensity of an activity based on local, contextual cues.
Previous FARG models use centralized structures, like the global temperature value, to control the behavior of the Parallel Terraced Scan.
This paper explores how to maintain the same behavior while distributing decision-making throughout the system.
Specifically, this paper attempts different refactors of the copycat architecture.
First, the probability adjustment formulas based on temperature are changed.
Then, we experiment with two methods for replacing temperature with a distributed metric.
Initially, temperature is removed destructively, essentially removing any lines of code that mention it, simply to see what effect it has.
Then, a surgical removal of temperature is attempted, leaving the affected structures intact or replacing them with effective distributed mechanisms.
To evaluate the distributed nature of copycat, this paper focuses on the creation of a `normal science' framework.
By `Normal science,' this paper means the term created by Thomas Kuhn--the collaborative enterprise of furthering understanding within a paradigm.
Today, "normal science" is simply not done on FARG architectures (and on most computational cognitive architectures too... see Addyman \& French 2012).
Unlike mathematical theories or experiments, which can be replicated by following the materials and methods, computational models generally have dozens of particularly tuned variables, undocumented procedures, multiple assumptions about the users computational environment, etc.
It then becomes close to impossible to reproduce a result, or to test some new idea scientifically.
This paper focuses on the introduction of statistical techniques, reduction of "magic numbers", improvement and documentation of formulas, and proposals for statistical human comparison.
We also discuss, in general, the nature of the brain as a distributed system.
While the removal of a single global variable may initially seem trivial, one must realize that copycat and other cognitive architectures have many central structures.
This paper explores the justification of these central structures in general.
Is it possible to model intelligence with them, or are they harmful?
\section{Theory}
\subsection{Notes}
According to the differences we can enumerate between brains and computers, it is clear that, since computers are universal and have vastly improved in the past five decades, that computers are capable of simulating intelligent processes.
[Cite Von Neumann].
Primarily, the main obstacle now lies in our comprehension of intelligent processes.
Once we truly understand the brain, writing software that emulates intelligence will be a relatively simple software engineering task.
However, we must be careful to remain true to what we already know about intelligent processes so that we may come closer to learning more about them and eventually replicating them in full.
The largest difference between the computer and the brain is the distributed nature of computation.
Specifically, our computers as they exist today have central processing units, where literally all of computation happens.
On the other hand, our brains have no central location where all processing happens.
Luckily, the speed advantage and universality of computers makes it possible to simulate the distributed behavior of the brain.
However, this simulation is only possible if computers are programmed with concern for the distributed nature of the brain.
[Actually, I go back and forth on this: global variables might be plausible, but likely aren't]
Also, even though the brain is distributed, some clustered processes must take place.
In general, centralized structures should be removed from the copycat software, because they will likely improve the accuracy of simulating intelligent processes.
It isn't clear to what degree this refactor should take place.
The easiest target is the central variable, temperature, but other central structures exist.
This paper focuses primarily on temperature, and the unwanted global unification associated with it.
Even though copycat uses simulated parallel code, if copycat were actually parallelized, the global variable of temperature would actually prevent most copycat codelets from running at the same time.
If this global variable and other constricting centralized structures were removed, copycat's code would more closely replicate intelligent processes and would be able to be run much faster.
From a functional-programming-like perspective (i.e. LISP, the original language of copycat), the brain should simply be carrying out the same function in many locations (i.e. mapping neuron.process() across each of its neurons, if you will...)
Note that this is more similar to the behavior of a GPU than a CPU....?
However, in violating this model with the introduction of global variables......
Global variables seem like a construct that people use to model the real world.
...
It is entirely possible that at the level of abstraction that copycat uses, global variables are perfectly acceptable.
For example, a quick grep-search of copycat shows that the workspace singleton also exists as a global variable.
Making all of copycat distributed clearly would require a full rewrite of the software....
If copycat can be run such that codelets may actually execute at the same time (without pausing to access globals), then it will much better replicate the human brain.
However, I question the assumption that the human brain has absolutely no centralized processing.
For example, input and output channels (i.e. speech mechanisms) are not accessible from the entire brain.
Also, brain-region science leads me to believe that some brain regions truly are "specialized" (for example, research concerning Wernicke's or Broca's areas), and thus lend some support to the existence of centralized structures in a computer model of the brain.
However, these centralized structures may be emergent?
So, to re-iterate: Two possibilities exist (hypotheses)
A computer model of the brain can contain centralized structures and still be effective in its modeling.
A computer model cannot have any centralized structures if it is going to be effective in its modeling.
Another important problem is defining the word "effective".
I suppose that "effective" would mean capable of solving fluid analogy problems, producing similar answers to an identically biased human.
However, it isn't clear to me that removing temperature increases the ability to solve problems effectively.
Is this because models are allowed to have centralized structures, or because temperature isn't the only centralized structure?
Clearly, creating a model of copycat that doesn't have centralized structures will take an excessive amount of effort.
\break
The calculation for temperature in the first place is extremely convoluted (in the Python version of copycat).
It lacks any documentation, is full of magic numbers, and contains seemingly arbitrary conditionals.
(If I submitted this as a homework assignment, I would probably get a C. Lol)
Edit: Actually, the lisp version of copycat does a very good job of documenting magic numbers and procedures.
My main complaint is that this hasn't been translated into the Python version of copycat.
However, the Python version is translated from the Java version..
Lost in translation.
My goal isn't to roast copycat's code, however.
Instead, what I see is that all this convolution is \emph{unnecessary}.
Ideally, a future version of copycat, or an underlying FARG architecture, will remove this convolution and make the temperature calculation simpler, streamlined, documented, and understandable.
How will this happen, though?
A global description of the system is, at times, potentially useful.
However, in summing together the values of each workspace object, information is lost regarding which workspace objects are offending.
In general, the changes that occur will eventually be object-specific.
So, it seems to me that going from object-specific descriptions to a global description back to an object-specific action is a waste of time.
I don't think that a global description should be \emph{obliterated} (removed 100\%).
I just think that a global description should be reserved for when global actions are taking place.
For example, when deciding that copycat has found a satisfactory answer, a global description should be used, because deciding to stop copycat is a global action.
However, when deciding to remove a particular structure, a global description should not be used, because removing a particular offending structure is NOT a global action.
Summary: it is silly to use global information to make local decisions that would be better made using local information (self-evident).
Benefits of using local information to make local decisions:
Code can be truly distributed, running in true parallel, CPU-bound.
This means that copycat would be faster and more like a human brain.
Specific structures would be removed based on their own offenses.
This means that relevant structures would remain untouched, which would be great!
Likely, this change to copycat would produce better answer distributions testable through the normal science framework.
On the other hand (I've never met a one-handed researcher), global description has some benefits.
For example, the global formula for temperature converts the raw importance value for each object into a relative importance value for each object.
If a distributed metric was used, this importance value would have to be left in its raw form.
\break
The original copycat was written in LISP, a mixed-paradigm language.
Because of LISP's preference for functional code, global variables must be explicitly marked with surrounding asterisks.
Temperature, the workspace, and final answers are all marked global variables as discussed in this paper.
These aspects of copycat are all - by definition - impure, and therefore imperative code that relies on central state changes.
It is clear that, since imperative, mutation-focused languages (like Python) are turing complete in the same way that functional, purity-focused languages (like Haskell) are turing complete, each method is clearly capable of modeling the human brain.
However, the algorithm run by the brain is more similar to distributed, parallel functional code than it is to centralized, serial imperative code.
While there is some centralization in the brain, and evidently some state changes, it is clear that 100\% centralized 100\% serial code is not a good model of the brain.
Also, temperature is, ultimately, just a function of objects in the global workspace.
The git branch soft-temp-removal hard-removes most usages of temperature, but continues to use a functional version of the temperature calculation for certain processes, like determining if the given answer is satisfactory or not.
So, all mentions of temperature could theoretically be removed and replaced with a dynamic calculation of temperature instead.
It is clear that in this case, this change is unnecessary.
With the goal of creating a distributed model in mind, what actually bothers me more is the global nature of the workspace, coderack, and other singleton copycat structures.
Really, when temperature is removed and replaced with some distributed metric, it is clear that the true "offending" global is the workspace/coderack.
Alternatively, codelets could be equated to ants in an anthill (see anthill analogy in GEB).
Instead of querying a global structure, codelets could query their neighbors, the same way that ants query their neighbors (rather than, say, relying on instructions from their queen).
Biological or psychological plausibility only matters if it actually affects the presence of intelligent processes. For example, neurons don't exist in copycat because we feel that they aren't required to simulate the processes being studied. Instead, copycat uses higher-level structures to simulate the same emergent processes that neurons do. However, codelets and the control of them relies on a global function representing tolerance to irrelevant structures. Other higher level structures in copycat likely rely on globals as well. Another central variable in copycat is the "rule" structure, of which there is only one. While some global variables might be viable, others may actually obstruct the ability to model intelligent processes. For example, a distributed notion of temperature will not only increase biological and psychological plausibility, but increase copycat's effectiveness at producing acceptable answer distributions.
We must also realize that copycat is only a model, so even if we take goals (level of abstraction) and biological plausibility into account...
It is only worth changing temperature if it affects the model.
Arguably, it does affect the model. (Or, rather, we hypothesize that it does. There is only one way to find out for sure, and that's the point of this paper)
So, maybe this is a paper about goals, model accuracy, and an attempt to find which cognitive details matter and which don't. It also might provide some insight into making a "Normal Science" framework.
Copycat is full of random uncommented parameters and formulas. Personally, I would advocate for removing or at least documenting as many of these as possible. In an ideal model, all of the numbers present might be either from existing mathematical formulas, or present for a very good (emergent and explainable - so that no other number would make sense in the same place) reason. However, settling on so called "magic" numbers because the authors of the program believed that their parameterizations were correct is very dangerous. If we removed random magic numbers, we would gain confidence in our model, progress towards a normal science, and gain a better understanding of cognitive processes.
Similarly, a lot of the testing of copycat is based on human perception of answer distributions. However, I suggest that we move to a more statistical approach. For example, deciding on some arbitrary baseline answer distribution and then modifying copycat to obtain other answer distributions and then comparing distributions with a statistical significance test would actually be indicative of what effect each change had. This paper will include code changes and proposals that lead copycat (and FARG projects in general) to a more statistical and verifiable approach.
While there is a good argument about copycat representing an individual with biases and therefore being incomparable to a distributed group of individuals, I believe that additional effort should be made to test copycat against human subjects. I may include in this paper a concrete proposal on how such an experiment might be done.
Let's simply test the hypothesis: \[H_i\] Copycat will have an improved (significantly different with increased frequencies of more desirable answers and decreased frequencies of less desirable answers: desirability will be determined by some concrete metric, such as the number of relationships that are preserved or mirrored) answer distribution if temperature is turned to a set of distributed metrics. \[H_0\] Copycat's answer distribution will be unaffected by changing temperature to a set of distributed metrics.
\subsection{Normal Science}
\subsubsection{Scientific Style}
The Python3 version of copycat contains many undocumented formulas and magic numbers.
Also, because of the random nature of copycat, sometimes answer distributions can be affected by the computer architecture that the software is being executed on.
To avoid this, this paper suggests documentation of formulas, removal or clear justification of magic numbers, and the use of seeding to get around random processes.
Additionally, I might discuss how randomness doesn't *really* exist.
Because of this, maybe the explicit pseudo-random nature of Copycat shouldn't exist?
Instead, the distributed nature of computation might act as a pseudo-random process in and of itself.
\subsubsection{Scientific Testing}
Previously, no statistical tests have been done with the copycat software.
Copycat can be treated like a black box, where, when given a particular problem, copycat produces a distribution of answers as output.
In this perspective, copycat can be tweaked, and then output distributions on the same problem can be compared with a statistical test, like a $\chi^2$ test.
The $\chi^2$ value indicates the degree to which a new copycat distribution differs from an old one.
So, a $\chi^2$ test is useful both as a unit test and as a form of scientific inquiry.
For example, if a new feature is added to copycat (say, the features included in the Metcat software), then the new distributions can be compared to the distributions produced by the original version of copycat.
Ideally, these distributions will differ, giving us a binary indication of whether the changes to the software actually had any effect.
Then, once we know that a distribution is statistically novel, we can decide on metrics that evaluate its effectiveness in solving the given problem.
For example, since Metacat claims to solve the "xyz" problem, and "wyz" is generally seen as the best answer to the "xyz" problem, a metric that evaluates the health of a distribution might simply be the percentage of "wyz" answers.
This can be generalized to the percentage of desirable answers given by some copycat variant in general.
Another metric might be the inverse percentage of undesirable answers.
For example, "xyd" is an undesirable answer to the "xyz" problem.
So, if Metacat produced large quantities of "xyd," it would be worse than the legacy copycat.
However, the legacy copycat produces large quantities of "xyd" and small quantities of "wyz".
Given these two discussed metrics, it would be clear that, through our normal science framework, Metacat is superior at solving the "xyz" problem.
Ideally, this framework can be applied to other copycat variants and on other problems.
Through the lens of this framework, copycat can be evaluated scientifically.
\subsection{Distribution}
\subsubsection{Von Neumann Discussion}
An objective, scientifically oriented framework is essential to making progress in the domain of cognitive science.
[John Von Neumann: The Computer and the Brain?
He pointed out that there were good grounds merely in terms of electrical analysis to show that the mind, the brain itself, could not be working on a digital system. It did not have enough accuracy; or... it did not have enough memory. ...And he wrote some classical sentences saying there is a statistical language in the brain... different from any other statistical language that we use... this is what we have to discover. ...I think we shall make some progress along the lines of looking for what kind of statistical language would work.]
Notion that the brain obeys statistical, entropical mathematics
\subsubsection{Turing Completeness}
In a nutshell, because computers are turing complete, it is clear that they can simulate the human brain, given enough power/time.
\subsubsection{Simulation of Distributed Processes}
Despite the ability of computers to simulate the human brain, simulation may not always be accurate unless programmed to be accurate...
\subsubsection{Efficiency of True Distribution}
\subsubsection{Temperature in Copycat}
\subsubsection{Other Centralizers in Copycat}
\subsubsection{The Motivation for Removing Centralizers in Copycat}
\section{Methods}
\subsection{Formula Adjustments}
\subsubsection{Temperature Probability Adjustment}
This research began with adjustments to probability weighting formulas.
In copycat, temperature affects the simulation in multiple ways:
\begin{enumerate}
\item Certain codelets are probabilistically chosen to run
\item Certain structures are probabilistically chosen to be destroyed
\item ...
\end{enumerate}
In many cases, the formulas "get-adjusted-probability" and "get-adjusted-value" are used.
Each curves a probability as a function of temperature.
The desired behavior is as follows:
At high temperatures, the system should explore options that would otherwise be unlikely.
So, at temperatures above half of the maximum temperature, probabilities with a base value less than fifty percent will be curved higher, to some threshold.
At temperatures below half of the maximum temperature, probabilities with a base value above fifty percent will be curved lower, to some threshold.
The original formulas being used to do this were overly complicated.
In summary, many formulas were tested in a spreadsheet, and an optimal one was chosen that replicated the desired behavior.
The original formula for curving probabilities in copycat:
\lstinputlisting[language=Python]{formulas/original.py}
An alternative that seems to improve performance on the abd->abd xyz->? problem:
This formula produces probabilities that are not bounded between 0 and 1. These are generally truncated.
\lstinputlisting[language=Python]{formulas/entropy.py}
Ultimately, it wasn't clear to me that the so-called "xyz" problem should even be considered.
As discussed in [the literature], the "xyz" problem is a novel example of a cognitive obstacle.
Generally, the best techniques for solving the "xyz" problem are discussed in the publications around the "Metacat" project, which gives copycat a temporary memory and levels of reflection upon its actions.
However, it is possible that the formula changes that target improvement in other problems may produce better results for the "xyz" problem.
Focusing on the "xyz" problem, however, will likely be harmful to the improvement of performance on other problems.
So, the original copycat formula is overly complicated, and doesn't perform optimally on several problems.
The entropy formula is an improvement, but other formulas are possible too.
Below are variations on a "weighted" formula.
The general structure is:
\[\emph{p'} = \frac{T}{100} * S + \frac{100-T}{100} * U\]
Where: $S$ is the convergence value for when $T = 0$ and
$U$ is the convergence value for when $T = 100$.
The below formulas simply experiment with different values for $S$ and $U$
The values of $\alpha$ and $\beta$ can be used to provide additional weighting for the formula, but are not used in this section.
\lstinputlisting[language=Python]{formulas/weighted.py}
[Discuss inverse formula and why $S$ was chosen to be constant]
After some experimentation and reading the original copycat documentation, it was clear that $S$ should be chosen to be $0.5$ and that $U$ should implement the probability curving desired at high temperatures.
The following formulas let $U = p^r$ if $p < 0.5$ and let $U = p^\frac{1}{r}$ if $p >= 0.5$.
This controls whether/when curving happens.
Now, the parameter $r$ simply controls the degree to which curving happens.
Different values of $r$ were experimented with (values between $10$ and $1$ were experimented with at increasingly smaller step sizes).
$2$ and $1.05$ are both good choices at opposite "extremes".
$2$ works because it is large enough to produce novel changes in behavior at extreme temperatures without totally disregarding the original probabilities.
Values above $2$ do not work because they make probabilities too uniform.
Values below $2$ (and above $1.05$) are feasible, but produce less curving and therefore less unique behavior.
$1.05$ works because it very closely replicates the original copycat formulas, providing a very smooth curving.
Values beneath $1.05$ essentially leave probabilities unaffected, producing no significant unique behavior dependent on temperature.
\lstinputlisting[language=Python]{formulas/best.py}
Random thought:
It would be interesting to not hardcode the value of $r$, but to instead leave it as a variable between $0$ and $2$ that changes depending on frustration.
However, this would be much like temperature in the first place....?
$r$ could itself be a function of temperature. That would be.... meta.... lol.
\break
...
\break
And ten minutes later, it was done.
The "meta" formula performs as well as the "best" formula on the "ijjkkk" problem, which I consider the most novel.
Interestingly, I noticed that the parameterized formulas aren't as good on this problem. What did I parameterize them for? Was it well justified?
(Probably not)
At this point, I plan on using the git branch "feature-normal-science-framework" to implement a system that takes in a problem set and provides several answer distributions as output.
Then, I'll do a massive cross-formula answer distribution comparison with $\chi^2$ tests. This will give me an idea about which formula and which changes are best.
I'll also be able to compare all of these answer distributions to the frequencies obtained in temperature removal branches of the repository.
\subsubsection{Temperature Calculation Adjustment}
\subsubsection{Temperature Usage Adjustment}
\subsection{$\chi^2$ Distribution Testing}
\section{Results}
\subsection{$\chi^2$ Table}
\section{Discussion}
\subsection{Distributed Computation Accuracy}
\subsection{Prediction}
\bibliographystyle{alpha}
\bibliography{sample}
\end{document}

93
papers/draft2.tex Normal file
View File

@ -0,0 +1,93 @@
\documentclass[a4paper]{article}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Sets page size and margins
\usepackage[a4paper,top=3cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry}
%% Useful packages
\usepackage{listings}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[colorlinks=true, allcolors=blue]{hyperref}
\definecolor{lightgrey}{rgb}{0.9, 0.9, 0.9}
\lstset{ %
backgroundcolor=\color{lightgrey}}
\title{Distributed Behavior in a Fluid Analogy Architecture}
\author{Lucas Saldyt, Alexandre Linhares}
\begin{document}
\maketitle
\begin{abstract}
\end{abstract}
\section{Introduction}
%% This paper stems from Mitchell's (1993) and Hofstadter's \& FARG's (1995) work on the copycat program.
%% This project focuses on effectively simulating intelligent processes through increasingly distributed decision-making.
%% In the process of evaluating the distributed nature of copycat, this paper also proposes a "Normal Science" framework.
%%
%% First, copycat uses a "Parallel Terraced Scan" as a humanistic inspired search algorithm.
%% The Parallel Terraced Scan corresponds to the psychologically-plausible behavior of briefly browsing, say, a book, and delving deeper whenever something sparks one's interest.
%% In a way, it is a mix between a depth-first and breadth-first search.
%% This type of behavior seems to very fluidly change the intensity of an activity based on local, contextual cues.
%% Previous FARG models use centralized structures, like the global temperature value, to control the behavior of the Parallel Terraced Scan.
%% This paper explores how to maintain the same behavior while distributing decision-making throughout the system.
%%
%% Specifically, this paper attempts different refactors of the copycat architecture.
%% First, the probability adjustment formulas based on temperature are changed.
%% Then, we experiment with two methods for replacing temperature with a distributed metric.
%% Initially, temperature is removed destructively, essentially removing any lines of code that mention it, simply to see what effect it has.
%% Then, a surgical removal of temperature is attempted, leaving affected structures intact or replacing them with effective distributed mechanisms.
%%
%% To evaluate the distributed nature of copycat, this paper focuses on the creation of a `normal science' framework.
%% By `Normal science,' this paper means the term created by Thomas Kuhn--the collaborative enterprise of furthering understanding within a paradigm.
%% Today, "normal science" is simply not done on FARG architectures (and on most computational cognitive architectures too... see Addyman \& French 2012).
%% Unlike mathematical theories or experiments, which can be replicated by following the materials and methods, computational models generally have dozens of particularly tuned variables, undocumented procedures, multiple assumptions about the users computational environment, etc.
%% It then becomes close to impossible to reproduce a result, or to test some new idea scientifically.
%% This paper focuses on the introduction of statistical techniques, reduction of "magic numbers", improvement and documentation of formulas, and proposals for statistical human comparison.
%%
%% We also discuss, in general, the nature of the brain as a distributed system.
%% While the removal of a single global variable may initially seem trivial, one must realize that copycat and other cognitive architectures have many central structures.
%% This paper explores the justification of these central structures in general.
%% Is it possible to model intelligence with them, or are they harmful?
%% {Von Neumann Discussion}
%% {Turing Completeness}
%% {Simulation of Distributed Processes}
%% {Efficiency of True Distribution}
%% {Temperature in Copycat}
%% {Other Centralizers in Copycat}
%% {The Motivation for Removing Centralizers in Copycat}
\section{Methods}
\subsection{Formula Documentation}
Many of copycat's formulas use magic numbers and marginally documented formulas.
This is less of a problem in the original LISP code, and more of a problem in the twice-translated Python3 version of copycat.
However, even in copycat's LISP implementation, formulas have redundant parameters.
For example, if given two formulas: $f(x) = 2x$ and $g(x) = x^2$, a single formula can be written $h(x) = 2x^2$ (The composed and then simplified formula).
Ideally, the adjustment formulas within copycat could be reduced in the same way, so that much of copycat's behavior rested on a handful of parameters in a single location, as opposed to more than ten parameters scattered throughout the repository.
Also, often parameters in copycat have little statistically significant effect.
As will be discussed in the $\chi^2$ distribution testing section, any copycat formulas without a significant effect will be hard-removed.
\subsection{Testing the Effect of Temperature}
To begin with, the existing effect of the centralizing variable, temperature, will be analyzed.
\subsection{Temperature Probability Adjustment}
\subsection{Temperature Usage Adjustment}
\subsection{$\chi^2$ Distribution Testing}
\section{Results}
\subsection{Cross $\chi^2$ Table}
\section{Discussion}
\subsection{Distributed Computation Accuracy}
\subsection{Prediction}
\bibliographystyle{alpha}
\bibliography{sample}
\end{document}

28
papers/formulas/adj.l Normal file
View File

@ -0,0 +1,28 @@
(defun get-temperature-adjusted-probability (prob &aux low-prob-factor
                                                  result)
  ; This function is a filter: it inputs a value (from 0 to 100) and returns
  ; a probability (from 0 - 1) based on that value and the temperature. When
  ; the temperature is 0, the result is (/ value 100), but at higher
  ; temperatures, values below 50 get raised and values above 50 get lowered
  ; as a function of temperature.
  ; I think this whole formula could probably be simplified.
  ;
  ; NOTE(review): fake-reciprocal and *temperature* are defined elsewhere in
  ; the copycat sources; this listing assumes *temperature* lies in [0, 100].
  (setq result
        (cond ((= prob 0) 0)               ; zero probability is a fixed point
              ((<= prob .5)
               ; Low branch: raise prob toward .5, capped by the min below.
               ; low-prob-factor is (roughly) the order of magnitude of prob,
               ; so smaller probabilities get a larger boost ceiling.
               (setq low-prob-factor (max 1 (truncate (abs (log prob 10)))))
               (min (+ prob
                       (* (/ (- 10 (sqrt (fake-reciprocal *temperature*)))
                             100)
                          (- (expt 10 (- (1- low-prob-factor))) prob)))
                    .5))
              ((= prob .5) .5)             ; the pivot value is unchanged
              ((> prob .5)
               ; High branch: mirror image of the low branch — lower prob
               ; toward .5, capped by the max below.
               (max (- 1
                       (+ (- 1 prob)
                          (* (/ (- 10 (sqrt (fake-reciprocal *temperature*)))
                                100)
                             (- 1 (- 1 prob)))))
                    .5))))
  result)

21
papers/formulas/best.py Normal file
View File

@ -0,0 +1,21 @@
def _working_best(temp, prob):
    """Curve *prob* via `_weighted`, converging to 0.5 at maximum temperature.

    At low temperature the curved target dominates: probabilities below 0.5
    are pushed down (``prob ** r``) and those at/above 0.5 are pushed up
    (``prob ** (1/r)``).
    """
    convergence = .5   # value approached as temp -> 100
    power = 1.05       # curving exponent
    if prob < .5:
        curved = prob ** power
    else:
        curved = prob ** (1 / power)
    return _weighted(temp, prob, convergence, curved)
def _soft_best(temp, prob):
    """Curve *prob* via `_weighted` with convergence 0.5 and exponent 1.05.

    NOTE(review): this body is identical to `_working_best` in this listing —
    presumably one of the two was meant to use different constants; confirm.
    """
    s, r = .5, 1.05  # convergence value and curving power
    curved = pow(prob, r) if prob < .5 else pow(prob, 1 / r)
    return _weighted(temp, prob, s, curved)
def _parameterized_best(temp, prob):
    """Like `_working_best`, but the hot-end convergence value is a weighted
    average of *prob* and 0.5 rather than a constant 0.5."""
    alpha = 5  # weight on prob
    beta = 1   # weight on the 0.5 baseline
    s = .5
    s = (alpha * prob + beta * s) / (alpha + beta)
    r = 1.05   # curving exponent
    if prob < .5:
        curved = prob ** r
    else:
        curved = prob ** (1 / r)
    return _weighted(temp, prob, s, curved)

View File

@ -0,0 +1,12 @@
import math
def _entropy(temp, prob):
    """Entropy-style temperature adjustment of a probability.

    Fixed points: prob == 0, the 0.5 pivot, and temp == 0 all pass through
    unchanged. For prob >= 0.5 the value is scaled by a coldness-dependent
    factor and mapped through -f * log2(f); values below 0.5 are handled by
    mirroring through the pivot. As noted in the paper, the result is not
    guaranteed to stay within [0, 1] and is generally truncated by callers.
    """
    if prob == 0 or prob == 0.5 or temp == 0:
        return prob
    if prob < 0.5:
        # BUG FIX: this branch originally called _original(), which is not
        # defined in this module (NameError at runtime). Mirror through the
        # pivot by self-recursion, matching the structure of original.py.
        return 1.0 - _entropy(temp, 1.0 - prob)
    coldness = 100.0 - temp
    a = math.sqrt(coldness)
    c = (10 - a) / 100       # coldness-dependent boost factor
    f = (c + 1) * prob       # boosted probability fed into the entropy term
    return -f * math.log2(f)

View File

@ -0,0 +1,12 @@
import math
def _original(temp, prob):
    """Original copycat temperature adjustment of a probability.

    prob == 0, the 0.5 pivot, and temp == 0 are fixed points. Probabilities
    above 0.5 are boosted by a coldness-dependent factor, floored at 0.5;
    probabilities below 0.5 are mirrored through the pivot.
    """
    if prob == 0 or prob == 0.5 or temp == 0:
        return prob
    if prob < 0.5:
        # Mirror the low half through the 0.5 pivot.
        return 1.0 - _original(temp, 1.0 - prob)
    coldness = 100.0 - temp
    boost = (10 - math.sqrt(coldness)) / 100
    scaled = (boost + 1) * prob
    return scaled if scaled > 0.5 else 0.5

View File

@ -0,0 +1,28 @@
def _weighted(temp, prob, s, u, alpha=1, beta=1):
    """Blend *s* and *u* linearly by temperature: *s* dominates as temp
    approaches 100, *u* dominates as temp approaches 0.

    ``prob``, ``alpha`` and ``beta`` are accepted for signature compatibility
    with the other formulas but are unused here.
    """
    hot_fraction = temp / 100
    cold_fraction = (100 - temp) / 100
    return hot_fraction * s + cold_fraction * u
def _weighted_inverse(temp, prob):
    """At high temperature converge toward ``1 - prob``; at low temperature
    keep *prob* unchanged."""
    complement = 1 - prob
    return _weighted(temp, prob, complement, prob)
# Uses .5 instead of 1-prob
def _fifty_converge(temp, prob):
    """At high temperature converge toward 0.5; at low temperature keep
    *prob* unchanged."""
    return _weighted(temp, prob, .5, prob)
# Curves to the average of the (1-p) and .5
def _soft_curve(temp, prob):
    """At high temperature converge toward the midpoint of ``1 - prob`` and
    0.5; at low temperature keep *prob*. The result is capped at 1."""
    midpoint = (1.5 - prob) / 2  # average of (1 - prob) and 0.5
    blended = _weighted(temp, prob, midpoint, prob)
    return min(1, blended)
# Curves to the weighted average of the (1-p) and .5
def _weighted_soft_curve(temp, prob):
    """At high temperature converge toward a weighted average of 0.5 and
    ``1 - prob`` (weights alpha and beta); at low temperature keep *prob*.
    The result is capped at 1."""
    weight = 100
    gamma = .5  # convergence value
    alpha = 1   # gamma weight
    beta = 3    # iprob weight
    target = (alpha * gamma + beta * (1 - prob)) / (alpha + beta)
    blended = (temp / weight) * target + ((weight - temp) / weight) * prob
    return min(1, blended)

292
papers/legacy.tex Normal file
View File

@ -0,0 +1,292 @@
\section{LSaldyt: Brainstorm, Planning, and Outline}
\subsection{Steps/plan}
Normal Science:
\begin{enumerate}
\item Introduce statistical techniques
\item Reduce magic number usage, document reasoning and math
\item Propose effective human subject comparison
\end{enumerate}
Temperature:
\begin{enumerate}
\item Propose formula improvements
\item Experiment with a destructive removal of temperature
\item Experiment with a "surgical" removal of temperature
\item Assess different copycat versions with/without temperature
\end{enumerate}
\subsection{Semi-structured Notes}
Biological or psychological plausibility only matters if it actually affects the presence of intelligent processes. For example, neurons don't exist in copycat because we feel that they aren't required to simulate the processes being studied. Instead, copycat uses higher-level structures to simulate the same emergent processes that neurons do. However, codelets and the control of them relies on a global function representing tolerance to irrelevant structures. Other higher level structures in copycat likely rely on globals as well. Another central variable in copycat is the "rule" structure, of which there is only one. While some global variables might be viable, others may actually obstruct the ability to model intelligent processes. For example, a distributed notion of temperature will not only increase biological and psychological plausibility, but increase copycat's effectiveness at producing acceptable answer distributions.
We must also realize that copycat is only a model, so even if we take goals (level of abstraction) and biological plausibility into account...
It is only worth changing temperature if it affects the model.
Arguably, it does affect the model. (Or, rather, we hypothesize that it does. There is only one way to find out for sure, and that's the point of this paper)
So, maybe this is a paper about goals, model accuracy, and an attempt to find which cognitive details matter and which don't. It also might provide some insight into making a "Normal Science" framework.
Copycat is full of random uncommented parameters and formulas. Personally, I would advocate for removing or at least documenting as many of these as possible. In an ideal model, all of the numbers present might be either from existing mathematical formulas, or present for a very good (emergent and explainable - so that no other number would make sense in the same place) reason. However, settling on so called "magic" numbers because the authors of the program believed that their parameterizations were correct is very dangerous. If we removed random magic numbers, we would gain confidence in our model, progress towards a normal science, and gain a better understanding of cognitive processes.
Similarly, a lot of the testing of copycat is based on human perception of answer distributions. However, I suggest that we move to a more statistical approach. For example, deciding on some arbitrary baseline answer distribution and then modifying copycat to obtain other answer distributions and then comparing distributions with a statistical significance test would actually be indicative of what effect each change had. This paper will include code changes and proposals that lead copycat (and FARG projects in general) to a more statistical and verifiable approach.
While there is a good argument about copycat representing an individual with biases and therefore being incomparable to a distributed group of individuals, I believe that additional effort should be made to test copycat against human subjects. I may include in this paper a concrete proposal on how such an experiment might be done.
Let's simply test the hypothesis: \[H_i\] Copycat will have an improved (significantly different with increased frequencies of more desirable answers and decreased frequencies of less desirable answers: desirability will be determined by some concrete metric, such as the number of relationships that are preserved or mirrored) answer distribution if temperature is turned to a set of distributed metrics. \[H_0\] Copycat's answer distribution will be unaffected by changing temperature to a set of distributed metrics.
\subsection{Random Notes}
This is all just free-flow unstructured notes. Don't take anything too seriously :).
Below are a list of relevant primary and secondary sources I am reviewing:
Biological/Psychological Plausibility:
\begin{verbatim}
http://www.cell.com/trends/cognitive-sciences/abstract/S1364-6613(16)30217-0
"There is no evidence for a single site of working memory storage."
https://ekmillerlab.mit.edu/2017/01/10/the-distributed-nature-of-working-memory/
Creativity as a distributed process (SECONDARY: Review primaries)
https://blogs.scientificamerican.com/beautiful-minds/the-real-neuroscience-of-creativity/
cognition results from the dynamic interactions of distributed brain areas operating in large-scale networks
http://scottbarrykaufman.com/wp-content/uploads/2013/08/Bressler_Large-Scale_Brain_10.pdf
MIT Encyclopedia of the Cognitive Sciences:
In reference to connectionist models:
"Advantages of distribution are generally held to include greater representational capacity, content addressability, automatic generalization, fault tolerance, and biological plausibility. Disadvantages include slow learning, catastrophic interference, and binding problems."
Cites:
French, R. (1992). Semi-distributed representation and catastrophic forgetting in connectionist networks.
Smolensky, P. (1991). Connectionism, constituency, and the language of thought.
[...]
\end{verbatim}
(Sure, we know that the brain is a distributed system, but citing some neuroscience makes me feel much safer.)
Goal related sources:
\begin{verbatim}
This will all most likely be FARG related stuff
Isolating and enumerating FARG's goals will help show me what direction to take
[..]
\end{verbatim}
Eliminating global variables might create a program that is more psychologically and biologically plausible, as according to the above. But our goals should be kept in mind. If we wanted full psychological and biological plausibility, we would just replicate a human mind atom for atom, particle for particle, or string for string.
Levels of abstraction in modeling the human brain and its processes:
\begin{enumerate}
\item Cloning a brain at the smallest scale possible (i.e. preserving quantum states of electrons or something)
\item Simulating true neurons, abstracting away quantum mechanical detail
\item Artificial neurons that abstract away electrochemical detail
\item Simulation of higher-level brain structures and behaviors that transcends individual neurons
...
\item Highest level of abstraction that still produces intelligent processes
\end{enumerate}
How far do we plan to go? What are we even abstracting? Which details matter and which don't?
One side: Abstraction from biological detail may eventually mean that global variables become plausible
Alt: Abstraction may remove some features and not others. Global variables may \emph{never} be plausible, even at the highest level of abstraction. (Of course, this extreme is probably not the case).
Lack of a centralized structure versus lack of a global phenomena
Since temperature, for example, is really just a function of several local phenomena, how global is it? I mean: If a centralized decision maker queried local phenomena separately, and made decisions based on that, it would be the same. Maybe centralized decision makers don't exist. Decisions, while seemingly central, have to emerge from agent processes. But what level of abstraction are we working on?
Clearly, if we knew 100\% which details mattered, we would already have an effective architecture.
\section{A formalization of the model}
Let $\Omega = \{\omega_1, \omega_2, ..., \omega_n\}$ be a finite discrete space. In FARG models $\Omega$ represents the \emph{working short-term memory} of the system and the goal is to craft a context-sensitive representation (cite FRENCH here). Hence $\Omega$ holds \emph{all possible configurations} of objects that could possibly exist in one's working memory; a large space.
Let us define the neighborhood function $A:(\Omega,$C$) \to 2^\Omega$ as the set of \emph{perceived affordances} under \emph{context} $C$. The affordances $A$ define which state transitions $\omega_i \to \omega_j$ are possible at a particular context $C$. Another term that has been used in the complexity literature is \emph{the adjacent possible}.
A context is defined by the high-level ideas, the concepts that are active at a particular point in time.
The \emph{Cohesion} of the system is measured by the mutual information between the external memory, the short-term memory state $\omega_i$, and the context $C$.
\subsection{Copycat}
% LUCAS: this entire section is copied from my old "minds and machines" paper... so we should discuss at some point whether to re-write it or not.
\subsubsection{The letter-analogy domain}
Consider the following, seemingly trivial, analogy problem: $abc \to abd:ijk \to ?$, that is, if the letter string “abc” changes to the letter string “abd”, how would the letter string “ijk” change “in the same way”? This is the domain of the Copycat project, and before we attempt a full description of the system, let us discuss in more detail some of the underlying intricacies. Most people will in this case come up with a rule of transformation that looks like: “Replace the rightmost letter by its successor in the alphabet”, the application of which would lead to $ijl$. This is a simple and straightforward example. But other examples bring us the full subtlety of this domain. The reader unfamiliar with the Copycat project is invited to consider the following problems: $abc\to abd: ijjkkk\to ?$, $abc\to abd: xyz\to ?$, $abc\to abd: mrrkkk\to ?$, among others (Mitchell, 2003) to have a sense of the myriad of subtle intuitions involved in solving these problems.
To solve this type of problem, one could come up with a scheme where the computer must first find a representation that models the change and then apply that change to the new string. This natural sequence of operations is \emph{not possible}, however, because \emph{the transformation rule representing the change itself must bend to contextual cues and adapt to the particularities of the letter strings}. For example, in the problem $abc\to abd: xyz\to ?$, the system may at first find a rule like “change rightmost letter to its successor in the alphabet”. However, this explicit rule cannot be carried out in this case, simply because $z$ has no successor. This leads to an impasse, out of which the only alternative by the system is to use a flexible, context-sensitive, representation system.
The reader may have noticed that this cognitive processing bears some similarities to the process of chess perception. Perception obviously plays a significant role in letter string analogies, as it is necessary to connect a set of individual units--in this case, letter sequences--, into a meaningful interpretation which stresses the underlying pressures of the analogy. In chess it is also necessary to connect disparate pieces into a meaningful description stressing the positions pressures. But the most striking similarities with chess perception (in what concerns bounded rationality) seems to be the absolute lack of a single objectively correct answer, we have instead just an intuitive subjective feeling, given by the great number of simultaneous pressures arising in each problem.
In the previous section we have made reference to some studies considering multiple, incompatible chunks that emerge in chess positions. In letter strings this same problem appears. Consider for instance the following problem:
If $aabc\to aabd: ijkk \to ?$
\begin{itemize}
\item One may chunk the initial strings as $(a)(abc)$ and $(a)(abd)$ and find a `corresponding chunk $(ijk)(k)$, which could lead to the following transformation rule: “change the last letter of the increasing sequence to its successor in the alphabet”. This interpretation would lead to the answer $ijlk$.
\item Or, alternatively, one may chunk the initial strings as $(aa)(b)(c)$ and $(aa)(b)(d)$ and find a counterpart string with the chunking $(i)(j)(kk)$, and, in this case, the mapping can be inverted: The first letter group $(aa)$ maps to the last letter group $(kk)$, and this will also invert the other mappings, leading to $(b)$ mapping to $(j)$ and $(c)$ mapping to $(i)$. Because this viewpoint substantially stresses the concept `opposite, Copycat is able to create the transformation rule “change the first letter to its predecessor in the alphabet”, leading to the solution $hjkk$, which preserves symmetry between group letter sizes and between successorship and predecessorship relations.
\item Other potential transformation rules could lead, in this problem, to $ijkl$ (change the last letter to its successor in the alphabet), $ijll$ (change the last group of letters to its successor in the alphabet), or $jjkk$ (change the first letter to its successor in the alphabet). This problem of many incompatible (and overlapping) chunkings is of importance. The specific chunking of a problem is directly linked to its solution, because chunks stress what is important on the underlying relations.
\end{itemize}
\subsubsection{The FARG architecture of Copycat}
How does the Copycat system work? Before reviewing its underlying parts, let us bear in mind one of its principal philosophical points. Copycat is not intended solely as a letter-string analogy program. The intention of the project is the test of a theory; a theory of `statistically emergent active symbols (Hofstadter 1979; Hofstadter 1985) which is diametrically opposite to the “symbol system hypothesis” (Newell, 1980; Simon, 1980). The major idea of active symbols is that instead of being tokens passively manipulated by programs, active symbols emerge from high numbers of interdependent subcognitive processes, which swarm over the system and drive its processing by triggering a complex `chain reaction of concepts. The system is termed `subsymbolic because these processes are intended to correspond to subliminal human information processes of few milliseconds, such as a subtle activation of a concept (i.e., priming), or an unconscious urge to look for a particular object. So the models are of collective (or emergent) computation, where a multitude of local processes gradually build a context-sensitive representation of the problem. These symbols are active because they drive processing, leading a chain reaction of activation spreading, in which active concepts continuously trigger related concepts, and short-term memory structures are construed to represent the symbol (in this philosophical view a token does not have any associated meaning, while a meaningful representation, a symbol, emerges from an interlocked interpretation of many subcognitive pressing urges).
This cognitively plausible architecture has been applied to numerous domains (see for instance French 1992; Mitchell and Hofstadter 1990; Mitchell 1993; McGraw 1995; Marshall 1999; Rehling 2001 MANY ARE MISSING HERE!). It has five principal components:
\begin{enumerate}
\item A workspace that interacts with external memory--this is the working short-term memory of the model. The workspace is where the representations are construed, with innumerable pressing urges waiting for attention and their corresponding impulsive processes swarming over the representation, independently perceiving and creating many types of subpatterns. Common examples of such subpatterns are bonds between letters such as group bonds between $a*a$ or successor bonds between successive letters $a*b$ , or relations between objects, awareness of abstract roles played by objects, and so on.
\item Pressing urges and impulsive processes The computational processes constructing the representations on short-term memory are subcognitive impulsive processes named codelets. The system perceives a great number of subtle pressures that immediately invoke subcognitive urges to handle them. These urges will eventually become impulsive processes. Some of these impulsive processes may look for particular objects, some may look for particular relations between objects and create bonds between them, some may group objects into chunks, or associate descriptions to objects, etc. The collective computation of these impulsive processes, at any given time, stands for the working memory of the model. These processes can be described as impulsive for a number of reasons: first of all, they are involuntary, as there is no conscious decision required for their triggering. (As Daniel Dennett once put it, if I ask you “not to think of an elephant”, it is too late, you already have done so, in an involuntary way.) They are also automatic, as there is no need for conscious decisions to be taken in their internal processing; they simply know how to do their job without asking for help. They are fast, with only a few operations carried out. They accomplish direct connections between their micro-perceptions and their micro-actions. Processing is also granular and fragmented as opposed to a linearly structured sequence of operations that cannot be interrupted (Linhares 2003). Finally, they are functional, associated with a subpattern, and operate on a subsymbolic level (but not restricted to the manipulation of internal numerical parameters as opposed to most connectionist systems).
\item List of parallel priorities— Each impulsive process executes a local, incremental, change to the emerging representation, but the philosophy of the system is that all pressing urges are perceived simultaneously, in parallel. So there is at any point in time a list of subcognitive urges ready to execute, fighting for the attention of the system and waiting probabilistically to fire as an impulsive process. This list of parallel priorities is named in Copycat as the coderack.
\item A semantic associative network undergoing constant flux The system has very limited basic knowledge: it knows the 26 letters of the alphabet, and the immediate successorship relations entailed (it does not, for instance, know that the shapes of lowercase letters p, b, q bear some resemblance). The long-term memory of the system is embedded over a network of nodes representing concepts with links between nodes associating related concepts. This network is a crucial part for the formation of the chain reaction of conceptual activation: any specific concept, when activated, propagates activation to its related concepts, which will in turn launch top-down expectation-driven urges to look for those related concepts. This mode of computation not only enforces a context-sensitive search but also is the basis of the chain reaction of activation spreading hence the term active symbols. This network is named in Copycat as the slipnet. One of the most original features of the slipnet is the ability to “slip one concept into another”, in which analogies between concepts are made (for details see Hofstadter 1995, Mitchell 1993).
\item A temperature measure It should be obvious that the system does not zoom in immediately and directly into a faultless representation. The process of representation construction is gradual, tentative, and numerous impulsive processes are executed erroneously. At start, the system has no expectations of the content of letter strings, so it slowly wanders through many possibilities before converging on an specific interpretation, a process named the parallel terraced scan (Hofstadter 1995); and embedded within it is a control parameter of temperature that is similar in some aspects to that found in simulated annealing (Cagan and Kotovsky 1997; Hofstadter 1995). The temperature measures the global amount of disorder and misunderstanding contained in the situation. So at the beginning of the process, when no relevant information has been gathered, the temperature will be high, but it will gradually decrease as intricate relationships are perceived, first concepts are activated, the abstract roles played by letters and chunks are found; and meaning starts to emerge. Though other authors have proposed a relationship between temperature and understanding (Cagan and Kotovsky, 1997), there is still a crucial difference here (see Hofstadter 1985, 1995): unlike the simulated annealing process that has a forcedly monotonically decreasing temperature schedule, the construction of a representation for these letter strings does not necessarily get monotonically improved as time flows. As in the $abc\to abd : xyz\to ?$ problem, there are many instants when roadblocks are reached, when snags appear, and incompatible structures arise. At these moments, complexity (and entropy and confusion) grows, and so the temperature decrease is not monotonic.
Finally, temperature does not act as a control parameter dictated by the user, that is, \emph{forced} to go either down or up, but it also acts \emph{as a feedback mechanism} to the system, which may reorganize itself, accepting or rejecting changes as temperature allows. As pressing urges are perceived, their corresponding impulses eventually propose changes to working memory, to construct or to destruct structures. How do these proposed changes get accepted? Through the guidance of temperature. At start $T$ is high and the vast majority of proposed structures are built, but as it decreases it becomes increasingly more important for a proposed change to be compatible with the existing interpretation. And the system may thus focus on developing a particular viewpoint.
\end{enumerate}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig4-copycat.png}
\caption{\label{fig:run-1}Copycat after 110 codelets have executed. This implementation was carried out by Scott Bolland from the University of Queensland, Australia (2003, available online).}
\end{figure}
\subsubsection{An example run}
Let us consider an example run of the Copycat system, and look at some specific steps in its processing of the problem $abc\to abd : iijjkk \to ?$
Figure \ref{fig:run-1} presents the working memory (workspace) after 110 codelets. The system at this point has not perceived much structure. It has perceived each individual letter, it has mapped the letters $c$ and $d$ between the original and target strings, and it has perceived some initial bonds between neighboring letters. Some of these bonds are sameness bonds (such as $i*i$), some are successorship bonds (such as $i*j$), and some are predecessorship bonds (such as $b*c$). In fact, there is confusion between the competing views of successorship and predecessorship relations in the string $abc$. These incompatible interpretations will occasionally compete. The system is also mapping the leftmost letter $a$ to the leftmost letter $i$.
Notice that a first chunk has been created in the group `$jj$'. Now \emph{this chunk is an individual object on its own}, capable of bonding with (and relating to) other objects. Notice also that the system has not yet perceived---and built the corresponding bond between---the two $k$'s in succession. So perception in Copycat is granular, fragmented over large numbers of small `micro-events'.
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig5-copycat.png}
\caption{\label{fig:run-2}Copycat's working memory after the execution of 260 codelets.}
\end{figure}
After an additional 150 codelets have been executed (Figure \ref{fig:run-2}), more structure is built: we now have three group chunks perceived; and there is also less confusion in the $abc$, as a `staircase' relation is perceived: that is, the system now perceives $abc$ as a successorship group, another chunked object. Finally, an initial translation rule appears: replace letter category of rightmost letter by successor. If the system were to stop processing at this stage it would apply this rule rather crudely and obtain the answer $iijjkl$. Note that temperature is dropping as more structure is created.
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig6-copycat.png}
\caption{\label{fig:run-3}Copycat's working memory after the execution of 280 codelets.}
\end{figure}
Let us slow down our overview a little bit and return in Figure \ref{fig:run-3} after only 20 codelets have run, to illustrate an important phenomenon: though $c$ now will map to the group $kk$, which is an important discovery, the global temperature will still be higher than that of the previous point (Figure \ref{fig:run-2}). This occurs because there is some `confusion' arising from the predecessorship bond which was found between chunks `$ii$' and `$jj$', which does not seem to fit well with all those successorship relations already perceived and with the high activation of the successorship concept. So temperature does not always drop monotonically.
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig7-copycat.png}
\caption{\label{fig:frog}Copycat's working memory after the execution of 415 codelets.}
\end{figure}
On the next step we can perceive two important changes: first, the system perceives some successorship relations between the groups $ii$ and $jj$ and between the groups $jj$ and $kk$, but these relations are perceived in isolation from each other. Another important discovery is that $jj$ is interpreted as being in `the middle of' $iijjkk$, which will eventually lead to its mapping to the letter $b$ in the original string.
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig8-copycat.png}
\caption{\label{fig:f8}Copycat's working memory after the execution of 530 codelets.}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{fig9-copycat.png}
\caption{\label{fig:f9}Final solution obtained after the execution of 695 codelets.}
\end{figure}
The system finally perceives that the successorship relations between the $ii$, $jj$, and $kk$ groups are not isolated and creates a single successorship group encompassing these three sameness groups. Thus two successor groups are perceived on the workspace, and a mapping between them is built. However, $a$ still maps to the letter $i$, instead of to the group $ii$, and $c$ still maps to the letter $k$, instead of to the group $kk$.
From this stage it still remains for the letter $a$ to map to the group $ii$ and for the letter $c$ to map to group $kk$, which will lead naturally to the translated rule ``replace letter category of rightmost group to successor'', illustrating the slipping of the concept letter to the concept group.
After 695 codelets, the system reaches the answer $iijjll$. The workspace may seem very clean and symmetric, but it has evolved from a great deal of disorder and from many microscopic `battles' between incompatible interpretations.
The most important concepts activated in this example were group and successor group. Once some sameness bonds were constructed, they rapidly activated the concept sameness group which re-inforced the search to find sameness groups, such as $kk$. Once the initial successorship bonds were created, the activation of the corresponding concept rapidly enabled the system to find other instances of successorship relations (between, for instance, the sameness groups $jj$ and $kk$). Different problems would activate other sets of concepts. For example, `$abc\to abd: xyz\to ?$ would probably activate the concept \emph{opposite}. And `$abc\to abd: mrrjjj\to ?$' would probably activate the concept length (Mitchell 1993). This rapid activation of concepts (and their top-down pressing urges), with the associated propagation of activation to related concepts, creates a chain reaction of impulsive cognition, and is the key to active symbols theory. The reader is refereed to Mitchell (1993) and to Marshall (1999) to have an idea of how the answers provided by Copycat resemble human intuition.
We may safely conclude at this point that there are many similarities between copycat and the chess perception process, including: (i) an iterative locking in process into a representation; (ii) smaller units bond and combine to form higher level, meaningfully coherent structures; (iii) the perception process is fragmented, granular, with great levels of confusion and entropy at start, but as time progresses it is able to gradually converge into a context-sensitive representation; (iv) there is a high interaction between an external memory, a limited size short term memory, and a long term memory; and (v) this interaction is done simultaneously by bottom-up and top-down processes.
\subsection{How to include Figures}
First you have to upload the image file from your computer using the upload link the project menu. Then use the includegraphics command to include it in your document. Use the figure environment and the caption command to add a number and a caption to your figure. See the code for Figure \ref{fig:frog} in this section for an example.
\subsection{How to add Comments}
Comments can be added to your project by clicking on the comment icon in the toolbar above. % * <john.hammersley@gmail.com> 2016-07-03T09:54:16.211Z:
%
% Here's an example comment!
%
To reply to a comment, simply click the reply button in the lower right corner of the comment, and you can close them when you're done.
Comments can also be added to the margins of the compiled PDF using the todo command\todo{Here's a comment in the margin!}, as shown in the example on the right. You can also add inline comments:
\todo[inline, color=green!40]{This is an inline comment.}
\subsection{How to add Tables}
Use the table and tabular commands for basic tables --- see Table~\ref{tab:widgets}, for example.
\begin{table}
\centering
\begin{tabular}{l|r}
Item & Quantity \\\hline
Widgets & 42 \\
Gadgets & 13
\end{tabular}
\caption{\label{tab:widgets}An example table.}
\end{table}
\subsection{How to write Mathematics}
\LaTeX{} is great at typesetting mathematics. Let $X_1, X_2, \ldots, X_n$ be a sequence of independent and identically distributed random variables with $\text{E}[X_i] = \mu$ and $\text{Var}[X_i] = \sigma^2 < \infty$, and let
\[S_n = \frac{X_1 + X_2 + \cdots + X_n}{n}
= \frac{1}{n}\sum_{i}^{n} X_i\]
denote their mean. Then as $n$ approaches infinity, the random variables $\sqrt{n}(S_n - \mu)$ converge in distribution to a normal $\mathcal{N}(0, \sigma^2)$.
\subsection{How to create Sections and Subsections}
Use section and subsections to organize your document. Simply use the section and subsection buttons in the toolbar to create them, and we'll handle all the formatting and numbering automatically.
\subsection{How to add Lists}
You can make lists with automatic numbering \dots
\begin{enumerate}
\item Like this,
\item and like this.
\end{enumerate}
\dots or bullet points \dots
\begin{itemize}
\item Like this,
\item and like this.
\end{itemize}
\subsection{How to add Citations and a References List}
You can upload a \verb|.bib| file containing your BibTeX entries, created with JabRef; or import your \href{https://www.overleaf.com/blog/184}{Mendeley}, CiteULike or Zotero library as a \verb|.bib| file. You can then cite entries from it, like this: \cite{greenwade93}. Just remember to specify a bibliography style, as well as the filename of the \verb|.bib|.
You can find a \href{https://www.overleaf.com/help/97-how-to-include-a-bibliography-using-bibtex}{video tutorial here} to learn more about BibTeX.
We hope you find Overleaf useful, and please let us know if you have any feedback using the help menu above --- or use the contact form at \url{https://www.overleaf.com/contact}!

339
papers/paper.tex Normal file
View File

@ -0,0 +1,339 @@
\documentclass[a4paper]{article}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Sets page size and margins
\usepackage[a4paper,top=3cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry}
%% Useful packages
\usepackage{listings}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[colorlinks=true, allcolors=blue]{hyperref}
\definecolor{lightgrey}{rgb}{0.9, 0.9, 0.9}
\lstset{ %
backgroundcolor=\color{lightgrey}}
\title{The Distributed Nature of Copycat..? (WIP)}
\author{Lucas Saldyt, Alexandre Linhares}
\begin{document}
\maketitle
\begin{abstract}
We investigate the distributed nature of computation in a FARG architecture, Copycat.
One of the foundations of those models is the \emph{Parallel Terraced Scan}--a psychologically-plausible model that enables a system to fluidly move between different modes of processing.
Previous work has modeled decision-making under Parallel Terraced Scan by using a central variable of \emph{Temperature}.
However, it is unlikely that this design decision accurately replicates the processes in the human brain.
Additionally, Copycat and other FARG architectures have incredibly high rates of unscientific inquiry.
Specifically, Copycat uses many undocumented formulas and magic numbers, some of which have been parameterized to fix particular problems at the expense of performing worse on others.
This paper aims to add a framework for conducting so-called ``Normal'' science with Copycat, in the hopes of making our findings more concrete.
\end{abstract}
\section{Introduction}
This paper stems from Mitchell's (1993) and Hofstadter \& FARG (1995). The goals of this project are twofold:
Firstly, we focus on effectively simulating intelligent processes through increasingly distributed decision-making.
...
Written by Linhares:
The Parallel Terraced Scan is a major innovation of FARG architectures.
It corresponds to the psychologically-plausible behavior of briefly browsing, say, a book, and delving deeper whenever something sparks one's interest.
This type of behavior seems to very fluidly change the intensity of an activity based on local, contextual cues.
It is found in high-level decisions such as marriage and low-level decisions such as a foraging predator choosing whether to further explore a particular area.
Previous FARG models have used a central temperature T to implement this behavior.
We explore how to maintain the same behavior while distributing decision-making throughout the system.
...
Specifically, we begin by attempting different refactors of the copycat architecture.
First, we experiment with different treatments of temperature, adjusting the formulas that depend on it.
Then, we experiment with two methods for replacing temperature with a distributed metric, instead.
First, we remove temperature destructively, essentially removing any lines of code that mention it, simply to see what effect it has.
Then, we move toward a surgical removal of temperature, leaving intact affected structures or replacing them with effective distributed mechanisms.
Secondly, we focus on the creation of a `normal science' framework in FARG architectures.
By `normal science' we mean the term coined by Thomas Kuhn---the collaborative enterprise of furthering understanding within a paradigm.
Today, ``normal science'' is simply not done on FARG architectures (and on most computational cognitive architectures too... see Addyman \& French 2012).
Unlike mathematical theories or experiments, which can be replicated by following the materials and methods, computational models generally have dozens of particularly tuned variables, undocumented procedures, multiple assumptions about the user's computational environment, etc.
It then becomes close to impossible to reproduce a result, or to test some new idea.
This paper focuses on the introduction of statistical techniques, reduction of ``magic numbers'', improvement and documentation of formulas, and proposals for effective human comparison.
We also discuss, in general, the nature of the brain as a distributed system.
While the removal of a single global variable may initially seem trivial, one must realize that copycat and other cognitive architectures have many central structures.
This paper explores the justification of these central structures in general.
Is it possible to model intelligence with them, or are they harmful?
...
\section{Body: Distributed Decision Making and Normal Science}
\subsection{Distributed Decision Making}
The distributed nature of decision making is essential to modeling intelligent processes [..]
\subsection{Normal Science}
An objective, scientifically oriented framework is essential to making progress in the domain of cognitive science.
[John Von Neumann: The Computer and the Brain?
He pointed out that there were good grounds merely in terms of electrical analysis to show that the mind, the brain itself, could not be working on a digital system. It did not have enough accuracy; or... it did not have enough memory. ...And he wrote some classical sentences saying there is a statistical language in the brain... different from any other statistical language that we use... this is what we have to discover. ...I think we shall make some progress along the lines of looking for what kind of statistical language would work.]
Notion that the brain obeys statistical, entropic mathematics
\subsection{Notes}
According to the differences we can enumerate between brains and computers, it is clear that, since computers are universal and have vastly improved in the past five decades, computers are capable of simulating intelligent processes.
[Cite Von Neumann].
Primarily, the main obstacle now lies in our comprehension of intelligent processes.
Once we truly understand the brain, writing software that emulates intelligence will be a relatively simple software engineering task.
However, we must be careful to remain true to what we already know about intelligent processes so that we may come closer to learning more about them and eventually replicating them in full.
The largest difference between the computer and the brain is the distributed nature of computation.
Specifically, our computers as they exist today have central processing units, where literally all of computation happens.
On the other hand, our brains have no central location where all processing happens.
Luckily, the speed advantage and universality of computers makes it possible to simulate the distributed behavior of the brain.
However, this simulation is only possible if computers are programmed with concern for the distributed nature of the brain.
[Actually, I go back and forth on this: global variables might be plausible, but likely aren't]
Also, even though the brain is distributed, some clustered processes must take place.
In general, centralized structures should be removed from the copycat software, because they will likely improve the accuracy of simulating intelligent processes.
It isn't clear to what degree this refactor should take place.
The easiest target is the central variable, temperature, but other central structures exist.
This paper focuses primarily on temperature, and the unwanted global unification associated with it.
Even though copycat uses simulated parallel code, if copycat were actually parallelized, the global variable of temperature would actually prevent most copycat codelets from running at the same time.
If this global variable and other constricting centralized structures were removed, copycat's code would more closely replicate intelligent processes and would be able to be run much faster.
From a functional-programming-like perspective (i.e. LISP, the original language of copycat), the brain should simply be carrying out the same function in many locations (i.e. mapping neuron.process() across each of its neurons, if you will...)
However, in violating this model with the introduction of global variables......
Global variables seem like a construct that people use to model the real world.
...
It is entirely possible that at the level of abstraction that copycat uses, global variables are perfectly acceptable.
For example, a quick grep-search of copycat shows that the workspace singleton also exists as a global variable.
Making all of copycat distributed clearly would require a full rewrite of the software....
If copycat can be run such that codelets may actually execute at the same time (without pausing to access globals), then it will much better replicate the human brain.
However, I question the assumption that the human brain has absolutely no centralized processing.
For example, input and output channels (i.e. speech mechanisms) are not accessible from the entire brain.
Also, brain-region science leads me to believe that some (for example, research concerning Wernicke's or Broca's areas) brain regions truly are ``specialized,'' and thus lend some support to the existence of centralized structures in a computer model of the brain.
However, these centralized structures may be emergent?
So, to re-iterate: Two possibilities exist (hypotheses)
A computer model of the brain can contain centralized structures and still be effective in its modeling.
A computer model cannot have any centralized structures if it is going to be effective in its modeling.
Another important problem is defining the word "effective".
I suppose that "effective" would mean capable of solving fluid analogy problems, producing similar answers to an identically biased human.
However, it isn't clear to me that removing temperature increases the ability to solve problems effectively.
Is this because models are allowed to have centralized structures, or because temperature isn't the only centralized structure?
Clearly, creating a model of copycat that doesn't have centralized structures will take an excessive amount of effort.
\break
.....
\break
The calculation for temperature in the first place is extremely convoluted (in the Python version of copycat).
It lacks any documentation, is full of magic numbers, and contains seemingly arbitrary conditionals.
(If I submitted this as a homework assignment, I would probably get a C. Lol)
Edit: Actually, the lisp version of copycat does a very good job of documenting magic numbers and procedures.
My main complaint is that this hasn't been translated into the Python version of copycat.
However, the Python version is translated from the Java version.
Lost in translation.
My goal isn't to roast copycat's code, however.
Instead, what I see is that all this convolution is \emph{unnecessary}.
Ideally, a future version of copycat, or an underlying FARG architecture, will remove this convolution, and make temperature calculation simpler, streamlined, documented, and understandable.
How will this happen, though?
A global description of the system is, at times, potentially useful.
However, in summing together the values of each workspace object, information is lost regarding which workspace objects are offending.
In general, the changes that occur will eventually be object-specific.
So, it seems to me that going from object-specific descriptions to a global description back to an object-specific action is a waste of time.
I don't think that a global description should be \emph{obliterated} (removed 100\%).
I just think that a global description should be reserved for when global actions are taking place.
For example, when deciding that copycat has found a satisfactory answer, a global description should be used, because deciding to stop copycat is a global action.
However, when deciding to remove a particular structure, a global description should not be used, because removing a particular offending structure is NOT a global action.
Summary: it is silly to use global information to make local decisions that would be better made using local information (self-evident).
Benefits of using local information to make local decisions:
Code can be truly distributed, running in true parallel, CPU-bound.
This means that copycat would be faster and more like a human brain.
Specific structures would be removed based on their own offenses.
This means that relevant structures would remain untouched, which would be great!
Likely, this change to copycat would produce better answer distributions testable through the normal science framework.
On the other hand (I've never met a one-handed researcher), global description has some benefits.
For example, the global formula for temperature converts the raw importance value for each object into a relative importance value for each object.
If a distributed metric was used, this importance value would have to be left in its raw form.
\subsubsection{Functional Programming Languages and the Brain}
The original copycat was written in LISP, a mixed-paradigm language.
Because of LISP's preference for functional code, global variables must be explicitly marked with surrounding asterisks.
Temperature, the workspace, and final answers are all marked global variables as discussed in this paper.
These aspects of copycat are all - by definition - impure, and therefore imperative code that relies on central state changes.
It is clear that, since imperative, mutation-focused languages (like Python) are Turing complete in the same way that functional, purity-focused languages (like Haskell) are Turing complete, each method is clearly capable of modeling the human brain.
However, the algorithm run by the brain is more similar to distributed, parallel functional code than it is to centralized, serial imperative code.
While there is some centralization in the brain, and evidently some state changes, it is clear that 100\% centralized 100\% serial code is not a good model of the brain.
Also, temperature is, ultimately, just a function of objects in the global workspace.
The git branch soft-temp-removal hard-removes most usages of temperature, but continues to use a functional version of the temperature calculation for certain processes, like determining if the given answer is satisfactory or not.
So, all mentions of temperature could theoretically be removed and replaced with a dynamic calculation of temperature instead.
It is clear that in this case, this change is unnecessary.
With the goal of creating a distributed model in mind, what actually bothers me more is the global nature of the workspace, coderack, and other singleton copycat structures.
Really, when temperature is removed and replaced with some distributed metric, it is clear that the true "offending" global is the workspace/coderack.
Alternatively, codelets could be equated to ants in an anthill (see anthill analogy in GEB).
Instead of querying a global structure, codelets could query their neighbors, the same way that ants query their neighbors (rather than, say, relying on instructions from their queen).
\subsection{Initial Formula Adjustments}
This research began with adjustments to probability weighting formulas.
In copycat, temperature affects the simulation in multiple ways:
\begin{enumerate}
\item Certain codelets are probabilistically chosen to run
\item Certain structures are probabilistically chosen to be destroyed
\item ...
\end{enumerate}
In many cases, the formulas "get-adjusted-probability" and "get-adjusted-value" are used.
Each curves a probability as a function of temperature.
The desired behavior is as follows:
At high temperatures, the system should explore options that would otherwise be unlikely.
So, at temperatures above half of the maximum temperature, probabilities with a base value less than fifty percent will be curved higher, to some threshold.
At temperatures below half of the maximum temperature, probabilities with a base value above fifty percent will be curved lower, to some threshold.
The original formulas being used to do this were overly complicated.
In summary, many formulas were tested in a spreadsheet, and an optimal one was chosen that replicated the desired behavior.
The original formula for curving probabilities in copycat:
\lstinputlisting[language=Python]{formulas/original.py}
An alternative that seems to improve performance on the abc->abd, xyz->? problem:
This formula produces probabilities that are not bounded between 0 and 1. These are generally truncated.
\lstinputlisting[language=Python]{formulas/entropy.py}
Ultimately, it wasn't clear to me that the so-called "xyz" problem should even be considered.
As discussed in [the literature], the "xyz" problem is a novel example of a cognitive obstacle.
Generally, the best techniques for solving the ``xyz'' problem are discussed in the publications around the ``Metacat'' project, which gives copycat a temporary memory and levels of reflection upon its actions.
However, it is possible that the formula changes that target improvement in other problems may produce better results for the "xyz" problem.
Focusing on the ``xyz'' problem, however, will likely be harmful to the improvement of performance on other problems.
So, the original copycat formula is overly complicated, and doesn't perform optimally on several problems.
The entropy formula is an improvement, but other formulas are possible too.
Below are variations on a "weighted" formula.
The general structure is:
\[p' = \frac{T}{100} \cdot S + \frac{100-T}{100} \cdot U\]
Where: $S$ is the convergence value for when $T = 0$ and
$U$ is the convergence value for when $T = 100$.
The below formulas simply experiment with different values for $S$ and $U$
The values of $\alpha$ and $\beta$ can be used to provide additional weighting for the formula, but are not used in this section.
\lstinputlisting[language=Python]{formulas/weighted.py}
[Discuss inverse formula and why $S$ was chosen to be constant]
After some experimentation and reading the original copycat documentation, it was clear that $S$ should be chosen to be $0.5$ and that $U$ should implement the probability curving desired at high temperatures.
The following formulas let $U = p^r$ if $p < 0.5$ and let $U = p^\frac{1}{r}$ if $p >= 0.5$.
This controls whether/when curving happens.
Now, the parameter $r$ simply controls the degree to which curving happens.
Different values of $r$ were experimented with (values between $10$ and $1$ were experimented with at increasingly smaller step sizes).
$2$ and $1.05$ are both good choices at opposite "extremes".
$2$ works because it is large enough to produce novel changes in behavior at extreme temperatures without totally disregarding the original probabilities.
Values above $2$ do not work because they make probabilities too uniform.
Values below $2$ (and above $1.05$) are feasible, but produce less curving and therefore less unique behavior.
$1.05$ works because it very closely replicates the original copycat formulas, providing a very smooth curving.
Values beneath $1.05$ essentially leave probabilities unaffected, producing no significant unique behavior dependent on temperature.
\lstinputlisting[language=Python]{formulas/best.py}
Random thought:
It would be interesting to not hardcode the value of $r$, but to instead leave it as a variable between $0$ and $2$ that changes depending on frustration.
However, this would be much like temperature in the first place....?
$r$ could itself be a function of temperature. That would be.... meta.... lol.
\break
...
\break
And ten minutes later, it was done.
The "meta" formula performs as well as the "best" formula on the "ijjkkk" problem, which I consider the most novel.
Interestingly, I noticed that the parameterized formulas aren't as good on this problem. What did I parameterize them for? Was it well justified?
(Probably not)
At this point, I plan on using the git branch "feature-normal-science-framework" to implement a system that takes in a problem set and provides several answer distributions as output.
Then, I'll do a massive cross-formula answer distribution comparison with $\chi^2$ tests. This will give me an idea about which formula and which changes are best.
I'll also be able to compare all of these answer distributions to the frequencies obtained in temperature removal branches of the repository.
\subsection{Steps/plan}
Normal Science:
\begin{enumerate}
\item Introduce statistical techniques
\item Reduce magic number usage, document reasoning and math
\item Propose effective human subject comparison
\end{enumerate}
Temperature:
\begin{enumerate}
\item Propose formula improvements
\item Experiment with a destructive removal of temperature
\item Experiment with a "surgical" removal of temperature
\item Assess different copycat versions with/without temperature
\end{enumerate}
\subsection{Semi-structured Notes}
Biological or psychological plausibility only matters if it actually affects the presence of intelligent processes. For example, neurons don't exist in copycat because we feel that they aren't required to simulate the processes being studied. Instead, copycat uses higher-level structures to simulate the same emergent processes that neurons do. However, codelets and the control of them relies on a global function representing tolerance to irrelevant structures. Other higher level structures in copycat likely rely on globals as well. Another central variable in copycat is the "rule" structure, of which there is only one. While some global variables might be viable, others may actually obstruct the ability to model intelligent processes. For example, a distributed notion of temperature will not only increase biological and psychological plausibility, but increase copycat's effectiveness at producing acceptable answer distributions.
We must also realize that copycat is only a model, so even if we take goals (level of abstraction) and biological plausibility into account...
It is only worth changing temperature if it affects the model.
Arguably, it does affect the model. (Or, rather, we hypothesize that it does. There is only one way to find out for sure, and that's the point of this paper)
So, maybe this is a paper about goals, model accuracy, and an attempt to find which cognitive details matter and which don't. It also might provide some insight into making a "Normal Science" framework.
Copycat is full of random uncommented parameters and formulas. Personally, I would advocate for removing or at least documenting as many of these as possible. In an ideal model, all of the numbers present might be either from existing mathematical formulas, or present for a very good (emergent and explainable - so that no other number would make sense in the same place) reason. However, settling on so called "magic" numbers because the authors of the program believed that their parameterizations were correct is very dangerous. If we removed random magic numbers, we would gain confidence in our model, progress towards a normal science, and gain a better understanding of cognitive processes.
Similarly, a lot of the testing of copycat is based on human perception of answer distributions. However, I suggest that we move to a more statistical approach. For example, deciding on some arbitrary baseline answer distribution and then modifying copycat to obtain other answer distributions and then comparing distributions with a statistical significance test would actually be indicative of what effect each change had. This paper will include code changes and proposals that lead copycat (and FARG projects in general) to a more statistical and verifiable approach.
While there is a good argument about copycat representing an individual with biases and therefore being incomparable to a distributed group of individuals, I believe that additional effort should be made to test copycat against human subjects. I may include in this paper a concrete proposal on how such an experiment might be done.
Let's simply test the hypothesis: $H_1$: Copycat will have an improved (significantly different with increased frequencies of more desirable answers and decreased frequencies of less desirable answers: desirability will be determined by some concrete metric, such as the number of relationships that are preserved or mirrored) answer distribution if temperature is turned to a set of distributed metrics. $H_0$: Copycat's answer distribution will be unaffected by changing temperature to a set of distributed metrics.
\subsection{Random Notes}
This is all just free-flow unstructured notes. Don't take anything too seriously :).
Below are a list of relevant primary and secondary sources I am reviewing:
Biological/Psychological Plausibility:
\begin{verbatim}
http://www.cell.com/trends/cognitive-sciences/abstract/S1364-6613(16)30217-0
"There is no evidence for a single site of working memory storage."
https://ekmillerlab.mit.edu/2017/01/10/the-distributed-nature-of-working-memory/
Creativity as a distributed process (SECONDARY: Review primaries)
https://blogs.scientificamerican.com/beautiful-minds/the-real-neuroscience-of-creativity/
cognition results from the dynamic interactions of distributed brain areas operating in large-scale networks
http://scottbarrykaufman.com/wp-content/uploads/2013/08/Bressler_Large-Scale_Brain_10.pdf
\end{verbatim}
\bibliographystyle{alpha}
\bibliography{sample}
\end{document}